From: xiongweimin <[email protected]>

This commit adds support for essential RDMA resource management verbs:
1. P_Key Table Query:
   - Implements the query_pkey verb for partition key retrieval
   - Handles endianness conversion for cross-platform compatibility
   - Provides complete error handling for device communication failures

2. QP Attribute Query:
   - Full QP state retrieval including capabilities and AH attributes
   - Byte-order handling for all struct fields
   - Init attribute preservation for consistency checks
   - Detailed error logging for debugging

3. User Memory Registration:
   - Memory pinning via ib_umem_get() with access flag enforcement
   - DMA-safe page table construction and bulk transfer to the device
   - Multi-architecture DMA address handling
   - Strict memory boundary validation
   - Resource cleanup guarantees on all error paths

Key enhancements:
- Unified virtqueue command infrastructure
- Cross-architecture endianness handling
- Atomic page table transfer for registered memory regions
- Protection domain integration for memory access control
- Error injection points for robust resource recovery

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h    |  35 ++
 .../drivers/infiniband/hw/virtio/vrdma_ib.c | 333 +++++++++++++++++-
 2 files changed, 367 insertions(+), 1 deletion(-)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d0ce02601..86b5ecade 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -225,6 +225,41 @@ struct vrdma_rsp_modify_qp {
         __u32 qpn;
 };
 
+struct vrdma_cmd_query_pkey {
+        __u32 port;
+        __u16 index;
+};
+
+struct vrdma_rsp_query_pkey {
+        __u16 pkey;
+};
+
+struct vrdma_cmd_query_qp {
+        __u32 qpn;
+        __u32 attr_mask;
+};
+
+struct vrdma_rsp_query_qp {
+        struct vrdma_qp_attr attr;
+};
+
+struct vrdma_cmd_reg_user_mr {
+        __u32 pdn;
+        __u32 access_flags;
+        __u64 start;
+        __u64 length;
+        __u64 virt_addr;
+
+        __u64 pages;
+        __u32 npages;
+};
+
+struct vrdma_rsp_reg_user_mr {
+        __u32 mrn;
+        __u32 lkey;
+        __u32 rkey;
+};
+
 #define VRDMA_CTRL_OK 0
 #define VRDMA_CTRL_ERR 1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f9b129774..b1429e072 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -23,6 +23,7 @@
 #include "vrdma_queue.h"
 
 #define VRTIO_RDMA_PAGE_PER_TBL 512
+#define VRDMA_MAX_PAGES (512 * 512)
 
 /**
  * cmd_str - String representation of virtio RDMA control commands
@@ -86,6 +87,36 @@ static void rdma_ah_attr_to_vrdma(struct vrdma_ah_attr *dst,
         memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
 }
 
+static void vrdma_to_ib_global_route(struct ib_global_route *dst,
+                                     const struct vrdma_global_route *src)
+{
+        dst->dgid = src->dgid;
+        dst->flow_label = src->flow_label;
+        dst->sgid_index = src->sgid_index;
+        dst->hop_limit = src->hop_limit;
+        dst->traffic_class = src->traffic_class;
+}
+
+static void vrdma_to_ib_qp_cap(struct ib_qp_cap *dst, const struct vrdma_qp_cap *src)
+{
+        dst->max_send_wr = src->max_send_wr;
+        dst->max_recv_wr = src->max_recv_wr;
+        dst->max_send_sge = src->max_send_sge;
+        dst->max_recv_sge = src->max_recv_sge;
+        dst->max_inline_data = src->max_inline_data;
+}
+
+static void vrdma_to_rdma_ah_attr(struct rdma_ah_attr *dst,
+                                  const struct vrdma_ah_attr *src)
+{
+        vrdma_to_ib_global_route(rdma_ah_retrieve_grh(dst), &src->grh);
+        rdma_ah_set_sl(dst, src->sl);
+        rdma_ah_set_static_rate(dst, src->static_rate);
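+        /* Remaining scalar fields map one-to-one; RoCE state is copied by the memcpy() below */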
+        rdma_ah_set_port_num(dst, src->port_num);
+        rdma_ah_set_ah_flags(dst, src->ah_flags);
+        memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
+}
+
 /**
  * vrdma_exec_verbs_cmd - Execute a verbs command via control virtqueue
  * @vrdev: VRDMA device
@@ -2521,6 +2552,303 @@ static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
         return rc;
 }
 
+/**
+ * vrdma_query_pkey - Query Partition Key (P_Key) at given index
+ * @ibdev: Verbs device (vRDMA virtual device)
+ * @port: Port number (1-indexed)
+ * @index: P_Key table index
+ * @pkey: Output buffer to store the P_Key value
+ *
+ * Queries the P_Key from the backend via a virtqueue command.
+ * Only meaningful for IB-style ports (not RoCE).
+ *
+ * Context: Process context (may sleep). Can be called from the user ioctl path.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if command allocation fails
+ * * -EIO or other negative errno on communication failure
+ */
+static int vrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey)
+{
+        struct vrdma_dev *vdev = to_vdev(ibdev);
+        struct vrdma_cmd_query_pkey *cmd;
+        struct vrdma_rsp_query_pkey *rsp;
+        struct scatterlist in, out;
+        int rc;
+
+        /* Allocate command and response buffers */
+        cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+        if (!cmd)
+                return -ENOMEM;
+
+        rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+        if (!rsp) {
+                kfree(cmd);
+                return -ENOMEM;
+        }
+
+        /* Fill input parameters */
+        cmd->port = cpu_to_le32(port);
+        cmd->index = cpu_to_le16(index);
+
+        /* Prepare scatterlists for virtqueue I/O */
+        sg_init_one(&in, cmd, sizeof(*cmd));
+        sg_init_one(&out, rsp, sizeof(*rsp));
+
+        /* Execute command */
+        rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_PKEY, &in, &out);
+        if (rc) {
+                dev_err(&vdev->vdev->dev,
+                        "VIRTIO_RDMA_CMD_QUERY_PKEY failed: port=%u idx=%u err=%d\n",
+                        port, index, rc);
+                goto out_free;
+        }
+
+        /* Copy result to the caller */
+        *pkey = le16_to_cpu(rsp->pkey);
+
+out_free:
+        kfree(rsp);
+        kfree(cmd);
+        return rc;
+}
+
+/**
+ * vrdma_query_qp - Query QP attributes from the backend
+ * @ibqp: Queue pair to query
+ * @attr: Output structure for QP attributes
+ * @attr_mask: Which fields are requested (ignored by some backends)
+ * @init_attr: Output structure for init-time attributes
+ *
+ * Queries the QP state and configuration via a control virtqueue command.
+ * This is a synchronous operation.
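+ * The response is copied into @attr and @init_attr only on success; on
+ * failure the caller's buffers are left untouched.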
+ *
+ * Context: Process context (can sleep)
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO or other negative errno on communication failure
+ */
+static int vrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+                          int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+        struct vrdma_qp *vqp = to_vqp(ibqp);
+        struct vrdma_dev *vdev = to_vdev(ibqp->device);
+        struct vrdma_cmd_query_qp *cmd;
+        struct vrdma_rsp_query_qp *rsp;
+        struct scatterlist in, out;
+        int rc;
+
+        /* Allocate command and response buffers */
+        cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+        if (!cmd)
+                return -ENOMEM;
+
+        rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+        if (!rsp) {
+                kfree(cmd);
+                return -ENOMEM;
+        }
+
+        /* Fill input parameters */
+        cmd->qpn = cpu_to_le32(vqp->qp_handle);
+        cmd->attr_mask = cpu_to_le32(attr_mask); /* Hint only; the backend may ignore it */
+
+        sg_init_one(&in, cmd, sizeof(*cmd));
+        sg_init_one(&out, rsp, sizeof(*rsp));
+
+        /* Execute command over control virtqueue */
+        rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_QP, &in, &out);
+        if (rc) {
+                dev_err(&vdev->vdev->dev,
+                        "VIRTIO_RDMA_CMD_QUERY_QP failed: qpn=0x%x err=%d\n",
+                        vqp->qp_handle, rc);
+                goto out_free;
+        }
+
+        /* Only copy results on success */
+        attr->qp_state = rsp->attr.qp_state;
+        attr->cur_qp_state = rsp->attr.cur_qp_state;
+        attr->path_mtu = rsp->attr.path_mtu;
+        attr->path_mig_state = rsp->attr.path_mig_state;
+        attr->qkey = le32_to_cpu(rsp->attr.qkey);
+        attr->rq_psn = le32_to_cpu(rsp->attr.rq_psn);
+        attr->sq_psn = le32_to_cpu(rsp->attr.sq_psn);
+        attr->dest_qp_num = le32_to_cpu(rsp->attr.dest_qp_num);
+        attr->qp_access_flags = le32_to_cpu(rsp->attr.qp_access_flags);
+        attr->pkey_index = le16_to_cpu(rsp->attr.pkey_index);
+        attr->alt_pkey_index = le16_to_cpu(rsp->attr.alt_pkey_index);
+        attr->en_sqd_async_notify = rsp->attr.en_sqd_async_notify;
+        attr->sq_draining = rsp->attr.sq_draining;
+        attr->max_rd_atomic = rsp->attr.max_rd_atomic;
+        attr->max_dest_rd_atomic = rsp->attr.max_dest_rd_atomic;
+        attr->min_rnr_timer = rsp->attr.min_rnr_timer;
+        attr->port_num = rsp->attr.port_num;
+        attr->timeout = rsp->attr.timeout;
+        attr->retry_cnt = rsp->attr.retry_cnt;
+        attr->rnr_retry = rsp->attr.rnr_retry;
+        attr->alt_port_num = rsp->attr.alt_port_num;
+        attr->alt_timeout = rsp->attr.alt_timeout;
+        attr->rate_limit = le32_to_cpu(rsp->attr.rate_limit);
+
+        /* Copy capabilities */
+        vrdma_to_ib_qp_cap(&attr->cap, &rsp->attr.cap);
+
+        /* Convert AH attributes (GRH plus RoCE destination state) */
+        vrdma_to_rdma_ah_attr(&attr->ah_attr, &rsp->attr.ah_attr);
+        vrdma_to_rdma_ah_attr(&attr->alt_ah_attr, &rsp->attr.alt_ah_attr);
+
+        /* Fill init attributes (mostly static) */
+        init_attr->event_handler = vqp->ibqp.event_handler;
+        init_attr->qp_context = vqp->ibqp.qp_context;
+        init_attr->send_cq = vqp->ibqp.send_cq;
+        init_attr->recv_cq = vqp->ibqp.recv_cq;
+        init_attr->srq = vqp->ibqp.srq;
+        init_attr->xrcd = NULL; /* XRC not supported in vRDMA */
+        init_attr->cap = attr->cap;
+        init_attr->sq_sig_type = IB_SIGNAL_REQ_WR; /* Driver default */
+        init_attr->qp_type = vqp->ibqp.qp_type;
+        init_attr->create_flags = 0;
+        init_attr->port_num = vqp->port;
+
+out_free:
+        kfree(rsp);
+        kfree(cmd);
+        return rc;
+}
+
+/**
+ * vrdma_reg_user_mr - Register a user memory region
+ * @pd: Protection domain
+ * @start: User virtual address of memory to register
+ * @length: Length of memory region
+ * @virt_addr: Virtual address for rkey-based access (often the same as @start)
+ * @access_flags: Access permissions (IB_ACCESS_xxx)
+ * @udata: User data (optional, unused here)
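+ *
+ * The flat page table is handed to the backend as a single DMA-coherent
+ * array of page addresses, so the transfer is one atomic command.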
+ *
+ * Locks down user pages, builds a page table, and registers the MR with the
+ * backend.
+ *
+ * Context: Process context (may sleep during ib_umem_get)
+ * Return:
+ * * Pointer to &mr->ibmr on success
+ * * ERR_PTR(-errno) on failure
+ */
+static struct ib_mr *vrdma_reg_user_mr(struct ib_pd *pd, u64 start,
+                                       u64 length, u64 virt_addr,
+                                       int access_flags,
+                                       struct ib_udata *udata)
+{
+        struct vrdma_dev *dev = to_vdev(pd->device);
+        struct vrdma_cmd_reg_user_mr *cmd;
+        struct vrdma_rsp_reg_user_mr *rsp;
+        struct vrdma_mr *mr;
+        struct ib_umem *umem;
+        struct sg_dma_page_iter sg_iter;
+        struct scatterlist in, out;
+        int rc = 0;
+        unsigned int npages;
+        dma_addr_t *pages_flat = NULL;
+
+        /* Step 1: Pin user memory pages */
+        umem = ib_umem_get(pd->device, start, length, access_flags);
+        if (IS_ERR(umem)) {
+                dev_err(&dev->vdev->dev,
+                        "Failed to pin user memory: va=0x%llx len=%llu\n",
+                        start, length);
+                return ERR_CAST(umem);
+        }
+
+        npages = ib_umem_num_pages(umem);
+        if (npages == 0 || npages > VRDMA_MAX_PAGES) {
+                dev_err(&dev->vdev->dev, "Invalid number of pages: %u\n", npages);
+                rc = -EINVAL;
+                goto err_umem;
+        }
+
+        /* Allocate command/response structures (GFP_KERNEL ok in process context) */
+        cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+        rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+        mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+        if (!cmd || !rsp || !mr) {
+                rc = -ENOMEM;
+                goto err_alloc;
+        }
+
+        /* Initialize MR structure */
+        mr->umem = umem;
+        mr->size = length;
+        mr->iova = virt_addr;
+        mr->max_pages = npages;
+
+        /* Allocate a contiguous DMA-mapped array for the page addresses */
+        pages_flat = dma_alloc_coherent(&dev->vdev->dev,
+                                        npages * sizeof(dma_addr_t),
+                                        &mr->dma_pages, GFP_KERNEL);
+        if (!pages_flat) {
+                dev_err(&dev->vdev->dev, "Failed to allocate DMA memory for page table\n");
+                rc = -ENOMEM;
+                goto err_alloc;
+        }
+        mr->pages_k = pages_flat; /* Kernel VA of the table, freed at deregistration */
+
+        /* Fill page table from the ib_umem scatterlist */
+        mr->npages = 0;
+        for_each_sg_dma_page(umem->sgt_append.sgt.sgl, &sg_iter,
+                             umem->sgt_append.sgt.nents, 0) {
+                dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
+
+                pages_flat[mr->npages++] = addr;
+        }
+
+        /* Sanity check: should match ib_umem_num_pages() */
+        WARN_ON(mr->npages != npages);
+
+        /* Prepare command */
+        cmd->pdn = cpu_to_le32(to_vpd(pd)->pd_handle);
+        cmd->start = cpu_to_le64(start);
+        cmd->length = cpu_to_le64(length);
+        cmd->virt_addr = cpu_to_le64(virt_addr);
+        cmd->access_flags = cpu_to_le32(access_flags);
+        cmd->pages = cpu_to_le64(mr->dma_pages); /* DMA address of the page array */
+        cmd->npages = cpu_to_le32(npages);
+
+        sg_init_one(&in, cmd, sizeof(*cmd));
+        sg_init_one(&out, rsp, sizeof(*rsp));
+
+        /* Send command to the backend */
+        rc = vrdma_exec_verbs_cmd(dev, VIRTIO_RDMA_CMD_REG_USER_MR, &in, &out);
+        if (rc) {
+                dev_err(&dev->vdev->dev, "Backend failed to register MR: %d\n", rc);
+                goto err_cmd;
+        }
+
+        /* Copy results from the response */
+        mr->mr_handle = le32_to_cpu(rsp->mrn);
+        mr->ibmr.lkey = le32_to_cpu(rsp->lkey);
+        mr->ibmr.rkey = le32_to_cpu(rsp->rkey);
+
+        /* Clean up temporary allocations */
+        kfree(cmd);
+        kfree(rsp);
+
+        /* Link the MR to its PD and initialize the remaining fields */
+        mr->ibmr.pd = pd;
+        mr->ibmr.device = pd->device;
+        mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+        mr->ibmr.length = length;
+
+        return &mr->ibmr;
+
+err_cmd:
+        dma_free_coherent(&dev->vdev->dev, npages * sizeof(dma_addr_t),
+                          pages_flat, mr->dma_pages);
+err_alloc:
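+        /* kfree(NULL) is a no-op, so partially completed allocations unwind safely */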
+        kfree(mr);
+        kfree(rsp);
+        kfree(cmd);
+err_umem:
+        ib_umem_release(umem);
+        return ERR_PTR(rc);
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
         .owner = THIS_MODULE,
         .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2554,7 +2882,10 @@ static const struct ib_device_ops vrdma_dev_ops = {
         .modify_qp = vrdma_modify_qp,
         .poll_cq = vrdma_poll_cq,
         .post_recv = vrdma_post_recv,
-        .post_send = vrdma_post_send,
+        .post_send = vrdma_post_send,
+        .query_pkey = vrdma_query_pkey,
+        .query_qp = vrdma_query_qp,
+        .reg_user_mr = vrdma_reg_user_mr,
 };
 
 /**
-- 
2.43.0
