From: xiongweimin <[email protected]>

This commit adds core RDMA verb implementations for the virtio RDMA driver:

1. Post Receive Support:
   - Full handling of recv_wr chains with SGE conversion
   - SMI QP rejection and a user-space QP fast path
   - Atomic buffer allocation with GFP_ATOMIC

2. Post Send Support:
   - Comprehensive opcode support including RDMA/Atomic/UD
   - Inline data handling via contiguous copy
   - Detailed error handling with bad_wr tracking
   - Memory registration (IB_WR_REG_MR) integration

Key features:
- Support for the ten IB_WR_* opcodes accepted by kernel QPs
- Specialized handling for RC/UD/GSI QP types
- Kernel-space WR processing with virtio command conversion
- Virtqueue batching (one kick per posted chain)
- Strict concurrency control via per-queue (SQ/RQ) locks

Signed-off-by: Xiong Weimin <[email protected]>
---
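Reviewer note (not part of the commit): below is a minimal, illustrative sketch of the
in-kernel consumer path these verbs serve. A ULP builds ib_recv_wr/ib_send_wr chains and
the RDMA core dispatches them through ib_device_ops to vrdma_post_recv()/vrdma_post_send().
The qp, buf_dma and lkey parameters are placeholders assumed to come from earlier QP
creation and MR registration; they are not part of this driver.

/* Illustrative only, not applied by git am. */
#include <rdma/ib_verbs.h>

static int example_post(struct ib_qp *qp, u64 buf_dma, u32 lkey)
{
	struct ib_sge sge = {
		.addr   = buf_dma,	/* DMA address of a registered buffer */
		.length = 4096,
		.lkey   = lkey,
	};
	struct ib_recv_wr rwr = {
		.wr_id   = 1,
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ib_send_wr swr = {
		.wr_id      = 2,
		.sg_list    = &sge,
		.num_sge    = 1,
		.opcode     = IB_WR_SEND,
		.send_flags = IB_SEND_SIGNALED,
	};
	const struct ib_recv_wr *bad_rwr;
	const struct ib_send_wr *bad_swr;
	int rc;

	/* Lands in vrdma_post_recv(): one command per WR on the RQ virtqueue */
	rc = ib_post_recv(qp, &rwr, &bad_rwr);
	if (rc)
		return rc;

	/* Lands in vrdma_post_send(): opcode-specific fields are serialized */
	return ib_post_send(qp, &swr, &bad_swr);
}

On error both verbs set bad_wr to the first work request that could not be queued;
requests before *bad_wr have already been handed to the device.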
 .../drivers/infiniband/hw/virtio/vrdma_abi.h |  99 ++++--
 .../drivers/infiniband/hw/virtio/vrdma_ib.c  | 310 +++++++++++++++++-
 2 files changed, 372 insertions(+), 37 deletions(-)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
index 7cdc4e488..0a9404057 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
@@ -222,6 +222,19 @@ struct vrdma_av {
 	__u8 reserved[6];	/* Reserved for future use / alignment padding */
 };
 
+struct vrdma_sge {
+	__u64 addr;
+	__u32 length;
+	__u32 lkey;
+};
+
+struct vrdma_cmd_post_recv {
+	__u32 qpn;
+	__u32 num_sge;
+	__u64 wr_id;
+	/* Scatter/gather list, serialized directly after the header */
+	struct vrdma_sge sge_list[];
+};
+
 /**
  * struct vrdma_cmd_post_send - User-space command to post a Send WQE
  *
@@ -232,48 +245,62 @@ struct vrdma_av {
  * All fields use fixed-size types for ABI stability across architectures.
  */
 struct vrdma_cmd_post_send {
-	__u32 num_sge;		/* Number of scatter-gather elements in this WQE */
-
-	__u32 send_flags;	/* IBV_SEND_xxx flags (e.g., signaled, inline, fence) */
-	__u32 opcode;		/* Operation code: RDMA_WRITE, SEND, ATOMIC, etc. */
-	__u64 wr_id;		/* Work Request ID returned in CQE */
-	union {
-		__be32 imm_data;	/* Immediate data for RC/UC QPs */
-		__u32 invalidate_rkey;	/* rkey to invalidate (on SEND_WITH_INV) */
-	} ex;
-
-	union wr_data {
-		struct {
-			__u64 remote_addr;	/* Target virtual address for RDMA op */
-			__u32 rkey;		/* Remote key for memory access */
-		} rdma;
-
-		struct {
-			__u64 remote_addr;	/* Address of atomic variable */
-			__u64 compare_add;	/* Value to compare */
-			__u64 swap;		/* Value to swap (or add) */
-			__u32 rkey;		/* Remote memory key */
-		} atomic;
-
-		struct {
-			__u32 remote_qpn;	/* Destination QP number */
-			__u32 remote_qkey;	/* Q_Key for UD packet validation */
-			struct vrdma_av av;	/* Address vector (L2/L3 info) */
-		} ud;
-
-		struct {
-			__u32 mrn;	/* Memory Region Number (MR handle) */
-			__u32 key;	/* Staging rkey for MR registration */
-			__u32 access;	/* Access flags (IB_ACCESS_xxx) */
-		} reg;
-	} wr;
-};
-
-struct vrdma_sge {
-	__u64 addr;
-	__u32 length;
-	__u32 lkey;
+	/* Length of sg_list */
+	__le32 num_sge;
+	/* Length of inline data */
+	__le16 inline_len;
+#define VIRTIO_IB_SEND_FENCE		(1 << 0)
+#define VIRTIO_IB_SEND_SIGNALED		(1 << 1)
+#define VIRTIO_IB_SEND_SOLICITED	(1 << 2)
+#define VIRTIO_IB_SEND_INLINE		(1 << 3)
+	/* Flags of the WR properties */
+	__u8 send_flags;
+	/* WR opcode, enum virtio_ib_wr_opcode */
+	__u32 opcode;
+	/* User defined WR ID */
+	__le64 wr_id;
+	union {
+		/* Immediate data (in network byte order) to send */
+		__le32 imm_data;
+		/* rkey to invalidate (SEND_WITH_INV / LOCAL_INV) */
+		__u32 invalidate_rkey;
+	} ex;
+	union {
+		struct {
+			/* Start address of the remote memory buffer */
+			__le64 remote_addr;
+			/* Key of the remote MR */
+			__le32 rkey;
+		} rdma;
+		struct {
+			__u64 remote_addr;
+			__u64 compare_add;
+			__u64 swap;
+			__u32 rkey;
+		} atomic;
+		struct {
+			/* Index of the destination QP */
+			__le32 remote_qpn;
+			/* Q_Key of the destination QP */
+			__le32 remote_qkey;
+			struct vrdma_av av;
+		} ud;
+		struct {
+			__u32 mrn;
+			__u32 key;
+			__u32 access;
+		} reg;
+		/* Reserved for future use */
+		__le64 reserved[4];
+	} wr;
+	/* Reserved for future use */
+	__le32 reserved2[3];
+	/*
+	 * Scatter/gather list; when VIRTIO_IB_SEND_INLINE is set this area
+	 * carries inline_len bytes of payload instead of SGEs.
+	 */
+	struct vrdma_sge sg_list[];
 };
 
 #endif
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index 705d18b55..f9b129774 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -2215,6 +2215,312 @@ static int vrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 	return i;	/* Return number of polled completions */
 }
 
+/**
+ * vrdma_post_recv - Post a list of receive work requests
+ * @ibqp: Queue pair
+ * @wr: List of receive work requests
+ * @bad_wr: Out parameter pointing to the first failed WR on error
+ *
+ * Submits receive buffers to the backend via the RQ virtqueue.
+ * Each WR is serialized into a command structure and passed to the host.
+ *
+ * Context: May be called from any context; the RQ lock is taken with
+ *          interrupts disabled and allocations use GFP_ATOMIC.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -ENOMEM, -EOPNOTSUPP)
+ */
+static int vrdma_post_recv(struct ib_qp *ibqp,
+			   const struct ib_recv_wr *wr,
+			   const struct ib_recv_wr **bad_wr)
+{
+	struct vrdma_qp *vqp = to_vqp(ibqp);
+	struct vrdma_cmd_post_recv *cmd;
+	unsigned int sgl_size;
+	int rc = 0;
+	struct scatterlist hdr;
+	struct scatterlist *sgs[1];
+	unsigned long flags;
+
+	/* SMI QPs are not supported */
+	if (ibqp->qp_type == IB_QPT_SMI) {
+		*bad_wr = wr;
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * For user-space QPs, recv posting is handled through the mmap'ed
+	 * rings; skip kernel-side posting and just notify the device.
+	 */
+	if (vqp->type == VIRTIO_RDMA_TYPE_USER)
+		goto kick_and_return;
+
+	/* Serialize access to the RQ */
+	spin_lock_irqsave(&vqp->rq->lock, flags);
+
+	while (wr) {
+		/* Validate required fields */
+		if (unlikely(wr->num_sge <= 0)) {
+			rc = -EINVAL;
+			goto out_bad_wr;
+		}
+
+		/*
+		 * struct vrdma_sge mirrors struct ib_sge (addr/length/lkey),
+		 * so the SGE array can be copied verbatim into the tail of
+		 * the command buffer.
+		 */
+		sgl_size = sizeof(struct vrdma_sge) * wr->num_sge;
+		cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+		if (!cmd) {
+			rc = -ENOMEM;
+			goto out_bad_wr;
+		}
+
+		/* Fill command */
+		cmd->qpn = vqp->qp_handle;
+		cmd->wr_id = (ibqp->qp_type == IB_QPT_GSI) ? 0 : wr->wr_id;
+		cmd->num_sge = wr->num_sge;
+		memcpy(cmd->sge_list, wr->sg_list, sgl_size);
+
+		/* Prepare scatterlist for the virtqueue */
+		sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+		sgs[0] = &hdr;
+
+		/* Add to virtqueue */
+		rc = virtqueue_add_sgs(vqp->rq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+		if (rc) {
+			kfree(cmd);
+			goto out_bad_wr;
+		}
+
+		wr = wr->next;
+	}
+
+	spin_unlock_irqrestore(&vqp->rq->lock, flags);
+
+kick_and_return:
+	virtqueue_kick(vqp->rq->vq);
+	return 0;
+
+out_bad_wr:
+	*bad_wr = wr;
+	spin_unlock_irqrestore(&vqp->rq->lock, flags);
+	virtqueue_kick(vqp->rq->vq);	/* Still kick so the backend sees the partial update */
+	return rc;
+}
+
+/**
+ * vrdma_copy_inline_data_to_wqe - Copy inline data from SGEs into the WQE buffer
+ * @wqe: Pointer to the vrdma_cmd_post_send command structure
+ * @ibwr: IB send work request containing SGEs with inline data
+ *
+ * Copies all data referenced by the SGEs into a contiguous area immediately
+ * following the WQE header; used when IB_SEND_INLINE is set.
+ *
+ * Assumes:
+ * - memory at sge->addr is a directly accessible kernel virtual address
+ * - the total size is <= the device max_inline_data
+ * - the WQE buffer has enough tailroom for all of the data
+ *
+ * Context: Called under the SQ lock, atomic context.
+ */
+static void vrdma_copy_inline_data_to_wqe(struct vrdma_cmd_post_send *wqe,
+					   const struct ib_send_wr *ibwr)
+{
+	const struct ib_sge *sge;
+	char *dst = (char *)wqe + sizeof(*wqe);	/* Start after the header */
+	int i;
+
+	for (i = 0; i < ibwr->num_sge; i++) {
+		sge = &ibwr->sg_list[i];
+
+		/* Skip zero-length segments */
+		if (sge->length == 0)
+			continue;
+
+		/*
+		 * WARNING: sge->addr is a user-space or kernel virtual address.
+		 * Using (void *)(uintptr_t)sge->addr assumes it is directly
+		 * dereferenceable, which is only valid if:
+		 * - the QP is a kernel QP, and
+		 * - the memory was registered and its mapping is trusted.
+		 */
+		memcpy(dst, (void *)(uintptr_t)sge->addr, sge->length);
+		dst += sge->length;
+	}
+}
+
+/**
+ * vrdma_post_send - Post a list of send work requests to the SQ
+ * @ibqp: Queue pair
+ * @wr: List of work requests
+ * @bad_wr: Out parameter pointing to the failing WR on error
+ *
+ * Converts each ib_send_wr into a vrdma_cmd_post_send and submits it
+ * via the send virtqueue. Supports both kernel and user QPs.
+ *
+ * Context: Any context; the SQ lock is taken with interrupts disabled and
+ *          allocations use GFP_ATOMIC.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -EINVAL, -ENOMEM), with @bad_wr set
+ *   to the first failed WR
+ */
+static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+			   const struct ib_send_wr **bad_wr)
+{
+	struct vrdma_qp *vqp = to_vqp(ibqp);
+	struct vrdma_cmd_post_send *cmd;
+	unsigned int sgl_size;
+	unsigned long flags;
+	int rc = 0;
+	int i;
+	struct scatterlist hdr;
+	struct scatterlist *sgs[1];
+
+	/* Fast path for user-space QPs: the SQ ring is filled by user space */
+	if (vqp->type == VIRTIO_RDMA_TYPE_USER) {
+		virtqueue_kick(vqp->sq->vq);
+		return 0;
+	}
+
+	spin_lock_irqsave(&vqp->sq->lock, flags);
+
+	while (wr) {
+		*bad_wr = wr;	/* In case of error */
+
+		/* Validate opcode support for kernel QPs */
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+		case IB_WR_SEND_WITH_INV:
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+		case IB_WR_RDMA_READ:
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+		case IB_WR_LOCAL_INV:
+		case IB_WR_REG_MR:
+			break;
+		default:
+			pr_warn("vRDMA: unsupported opcode %d for kernel QP\n",
+				wr->opcode);
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		/*
+		 * Allocate the command buffer with tailroom for either the
+		 * SGE array or, for inline sends, the inline payload itself.
+		 */
+		if (wr->send_flags & IB_SEND_INLINE) {
+			/* TODO: check against the device max_inline_data */
+			sgl_size = 0;
+			for (i = 0; i < wr->num_sge; i++)
+				sgl_size += wr->sg_list[i].length;
+		} else {
+			sgl_size = wr->num_sge * sizeof(struct vrdma_sge);
+		}
+		cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+		if (!cmd) {
+			rc = -ENOMEM;
+			goto out_unlock;
+		}
+
+		/* Fill common fields */
+		cmd->wr_id = wr->wr_id;
+		cmd->num_sge = wr->num_sge;
+		cmd->send_flags = wr->send_flags;
+		cmd->opcode = wr->opcode;
+
+		/* Payload: inline data or scatter/gather list */
+		if (wr->send_flags & IB_SEND_INLINE) {
+			cmd->inline_len = sgl_size;
+			vrdma_copy_inline_data_to_wqe(cmd, wr);
+		} else {
+			memcpy(cmd->sg_list, wr->sg_list, sgl_size);
+		}
+
+		/* Immediate data (SEND_WITH_IMM, WRITE_WITH_IMM) */
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			cmd->ex.imm_data = wr->ex.imm_data;
+
+		/* Invalidate key (SEND_WITH_INV, LOCAL_INV) */
+		if (wr->opcode == IB_WR_SEND_WITH_INV ||
+		    wr->opcode == IB_WR_LOCAL_INV)
+			cmd->ex.invalidate_rkey = wr->ex.invalidate_rkey;
+
+		/* QP-type specific fields (RDMA, atomic, MR registration, UD) */
+		switch (ibqp->qp_type) {
+		case IB_QPT_RC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				cmd->wr.rdma.remote_addr = rdma_wr(wr)->remote_addr;
+				cmd->wr.rdma.rkey = rdma_wr(wr)->rkey;
+				break;
+
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				cmd->wr.atomic.remote_addr = atomic_wr(wr)->remote_addr;
+				cmd->wr.atomic.rkey = atomic_wr(wr)->rkey;
+				cmd->wr.atomic.compare_add = atomic_wr(wr)->compare_add;
+				if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP)
+					cmd->wr.atomic.swap = atomic_wr(wr)->swap;
+				break;
+
+			case IB_WR_REG_MR: {
+				const struct ib_reg_wr *reg = reg_wr(wr);
+				struct vrdma_mr *vmr = to_vmr(reg->mr);
+
+				cmd->wr.reg.mrn = vmr->mr_handle;
+				cmd->wr.reg.key = reg->key;
+				cmd->wr.reg.access = reg->access;
+				break;
+			}
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+		case IB_QPT_GSI:
+			if (!ud_wr(wr)->ah) {
+				pr_warn("vRDMA: invalid address handle in UD WR\n");
+				kfree(cmd);
+				rc = -EINVAL;
+				goto out_unlock;
+			}
+			cmd->wr.ud.remote_qpn = ud_wr(wr)->remote_qpn;
+			cmd->wr.ud.remote_qkey = ud_wr(wr)->remote_qkey;
+			cmd->wr.ud.av = to_vah(ud_wr(wr)->ah)->av;
+			break;
+
+		default:
+			pr_err("vRDMA: unsupported QP type %d\n", ibqp->qp_type);
+			kfree(cmd);
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Prepare scatterlist for the virtqueue */
+		sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+		sgs[0] = &hdr;
+
+		rc = virtqueue_add_sgs(vqp->sq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+		if (rc) {
+			dev_err(&vqp->sq->vq->vdev->dev,
+				"vRDMA: failed to add send WR to vq: %d\n", rc);
+			kfree(cmd);
+			goto out_unlock;
+		}
+
+		/* Advance to the next WR */
+		wr = wr->next;
+	}
+
+out_unlock:
+	spin_unlock_irqrestore(&vqp->sq->lock, flags);
+
+	/* Kick even after a failure so the device processes WRs already queued */
+	virtqueue_kick(vqp->sq->vq);
+	return rc;
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2246,7 +2552,9 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.mmap_free = vrdma_mmap_free,
 	.modify_port = vrdma_modify_port,
 	.modify_qp = vrdma_modify_qp,
-	.poll_cq = vrdma_poll_cq,
+	.poll_cq = vrdma_poll_cq,
+	.post_recv = vrdma_post_recv,
+	.post_send = vrdma_post_send,
 };
 
 /**
-- 
2.43.0
