From: xiongweimin <[email protected]>

Add core RDMA verb implementations (post_send/post_recv) for the virtio RDMA driver:

1. Post Receive Support:
   - Full handling of recv_wr chains with SGE conversion
   - SMI QP rejection and user-space QP fast path
   - Atomic buffer allocation with GFP_ATOMIC

2. Post Send Support:
   - Opcode support covering SEND, RDMA read/write, atomics, and UD sends
   - Inline data handling via contiguous copy
   - Detailed error handling with bad_wr tracking
   - IB_WR_REG_MR memory registration integration (see the usage sketch below)
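
A minimal in-kernel usage sketch for reference (illustrative only; the QP/MR
setup, dma_addr, len and lkey are assumed to come from the usual
ib_create_qp()/ib_reg_mr() path and are not part of this patch):

  #include <rdma/ib_verbs.h>

  static int demo_post_recv(struct ib_qp *qp, u64 dma_addr, u32 len, u32 lkey)
  {
          struct ib_sge sge = {
                  .addr = dma_addr, .length = len, .lkey = lkey,
          };
          struct ib_recv_wr wr = {
                  .wr_id = 1, .sg_list = &sge, .num_sge = 1,
          };
          const struct ib_recv_wr *bad_wr;

          /* Dispatches to vrdma_post_recv() through ib_device_ops */
          return ib_post_recv(qp, &wr, &bad_wr);
  }

  static int demo_post_send(struct ib_qp *qp, u64 dma_addr, u32 len, u32 lkey)
  {
          struct ib_sge sge = {
                  .addr = dma_addr, .length = len, .lkey = lkey,
          };
          struct ib_send_wr wr = {
                  .wr_id = 2, .sg_list = &sge, .num_sge = 1,
                  .opcode = IB_WR_SEND, .send_flags = IB_SEND_SIGNALED,
          };
          const struct ib_send_wr *bad_wr;

          /* Dispatches to vrdma_post_send() through ib_device_ops */
          return ib_post_send(qp, &wr, &bad_wr);
  }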

Key features:
- Support for the common IB_WR opcodes: SEND (with IMM/INV), RDMA READ/WRITE
  (with IMM), atomic CMP_SWP/FETCH_ADD, LOCAL_INV and REG_MR
- Specialized handling for UD/RC/GSI QP types
- Kernel-space WR processing with virtio command conversion (layout sketch below)
- Single virtqueue kick per posted WR chain
- Per-queue (SQ/RQ) spinlock protection
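
For reference, the serialized layout of each send WR handed to the host, per
the vrdma_abi.h changes below (a sketch of the field grouping, not to scale):

  +----------------------------+--------------------------------------+
  | struct vrdma_cmd_post_send | sg_list[0 .. num_sge-1]              |
  | (num_sge/inline_len, flags,| or inline_len bytes of payload when  |
  |  opcode, wr_id, ex, wr)    | VIRTIO_IB_SEND_INLINE is set         |
  +----------------------------+--------------------------------------+
    one contiguous buffer, added to the send virtqueue via sg_init_one()

The receive command uses the same scheme: a vrdma_cmd_post_recv header
followed by its trailing sge_list[].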

Signed-off-by: Xiong Weimin <[email protected]>
---
 .../drivers/infiniband/hw/virtio/vrdma_abi.h  |  99 ++++--
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 310 +++++++++++++++++-
 2 files changed, 372 insertions(+), 37 deletions(-)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
index 7cdc4e488..0a9404057 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
@@ -222,6 +222,19 @@ struct vrdma_av {
        __u8 reserved[6];               /* Reserved for future use / alignment padding */
 };
 
+struct vrdma_sge {
+       __u64 addr;
+       __u32 length;
+       __u32 lkey;
+};
+
+struct vrdma_cmd_post_recv {
+       __u32 qpn;
+       __u32 num_sge;
+       __u64 wr_id;
+       struct vrdma_sge sge_list[];
+};
+
 /**
  * struct vrdma_cmd_post_send - User-space command to post a Send WQE
  *
@@ -232,48 +245,62 @@ struct vrdma_av {
  * All fields use fixed-size types for ABI stability across architectures.
  */
 struct vrdma_cmd_post_send {
-       __u32 num_sge;          /* Number of scatter-gather elements in this WQE */
-
-       __u32 send_flags;       /* IBV_SEND_xxx flags (e.g., signaled, inline, fence) */
-       __u32 opcode;           /* Operation code: RDMA_WRITE, SEND, ATOMIC, etc. */
-       __u64 wr_id;            /* Work Request ID returned in CQE */
-
        union {
-               __be32  imm_data;               /* Immediate data for RC/UC QPs */
-               __u32   invalidate_rkey;        /* rkey to invalidate (on SEND_WITH_INV) */
-       } ex;
-
-       union wr_data {
+               /* Length of sg_list */
+               __le32 num_sge;
+               /* Length of inline data, when VIRTIO_IB_SEND_INLINE is set */
+               __le16 inline_len;
+       };
+#define VIRTIO_IB_SEND_FENCE        (1 << 0)
+#define VIRTIO_IB_SEND_SIGNALED     (1 << 1)
+#define VIRTIO_IB_SEND_SOLICITED    (1 << 2)
+#define VIRTIO_IB_SEND_INLINE       (1 << 3)
+       /* Flags of the WR properties, VIRTIO_IB_SEND_xxx */
+       __u8 send_flags;
+       /* WR opcode, enum virtio_ib_wr_opcode */
+       __u32 opcode;
+       /* User defined WR ID */
+       __le64 wr_id;
+       union {
+               /* Immediate data (in network byte order) to send */
+               __le32 imm_data;
+               /* rkey to invalidate (SEND_WITH_INV / LOCAL_INV) */
+               __u32 invalidate_rkey;
+       } ex;
+       union {
                struct {
-                       __u64 remote_addr;      /* Target virtual address for RDMA op */
-                       __u32 rkey;             /* Remote key for memory access */
+                       /* Start address of remote memory buffer */
+                       __le64 remote_addr;
+                       /* Key of the remote MR */
+                       __le32 rkey;
                } rdma;
-
-               struct {
-                       __u64 remote_addr;      /* Address of atomic variable */
-                       __u64 compare_add;      /* Value to compare */
-                       __u64 swap;             /* Value to swap (or add) */
-                       __u32 rkey;             /* Remote memory key */
-               } atomic;
-
+               struct {
+                       __u64 remote_addr;
+                       __u64 compare_add;
+                       __u64 swap;
+                       __u32 rkey;
+               } atomic;
                struct {
-                       __u32 remote_qpn;       /* Destination QP number */
-                       __u32 remote_qkey;      /* Q_Key for UD packet validation */
-                       struct vrdma_av av;     /* Address vector (L2/L3 info) */
+                       /* Index of the destination QP */
+                       __le32 remote_qpn;
+                       /* Q_Key of the destination QP */
+                       __le32 remote_qkey;
+                       struct vrdma_av av;
                } ud;
-
-               struct {
-                       __u32 mrn;              /* Memory Region Number (MR handle) */
-                       __u32 key;              /* Staging rkey for MR registration */
-                       __u32 access;           /* Access flags (IB_ACCESS_xxx) */
-               } reg;
+               struct {
+                       __u32 mrn;
+                       __u32 key;
+                       __u32 access;
+               } reg;
+               /* Reserved for future */
+               __le64 reserved[4];
        } wr;
-};
-
-struct vrdma_sge {
-    __u64 addr;
-    __u32 length;
-    __u32 lkey;
+       /* Reserved for future */
+       __le32 reserved2[3];
+       /*
+        * Scatter/gather list; when VIRTIO_IB_SEND_INLINE is set, this
+        * trailing area carries inline_len bytes of inline data instead.
+        */
+       struct vrdma_sge sg_list[];
 };
 
 #endif
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index 705d18b55..f9b129774 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -2215,6 +2215,312 @@ static int vrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
        return i; /* Return number of polled completions */
 }
 
+/**
+ * vrdma_post_recv - Post a list of receive work requests
+ * @ibqp:      Queue pair
+ * @wr:                List of receive work requests
+ * @bad_wr:    Out parameter pointing to first failed WR on error
+ *
+ * Submits receive buffers to the backend via virtqueue.
+ * Each WR is serialized into a command structure and passed to the host.
+ *
+ * Context: Any context; uses spin_lock_irqsave() and GFP_ATOMIC allocations.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -ENOMEM, -EOPNOTSUPP)
+ */
+static int vrdma_post_recv(struct ib_qp *ibqp,
+                                const struct ib_recv_wr *wr,
+                                const struct ib_recv_wr **bad_wr)
+{
+       struct vrdma_qp *vqp = to_vqp(ibqp);
+       struct vrdma_cmd_post_recv *cmd;
+       unsigned int sgl_size;
+       int rc = 0;
+       struct scatterlist hdr;
+       struct scatterlist *sgs[1];
+       unsigned long flags;
+
+       /* SMI QPs are not supported */
+       if (ibqp->qp_type == IB_QPT_SMI) {
+               *bad_wr = wr;
+               return -EOPNOTSUPP;
+       }
+
+       /*
+        * For user-space QPs, we assume recv posting is handled differently
+        * (e.g., through mmap'ed rings). Skip kernel-side posting.
+        */
+       if (vqp->type == VIRTIO_RDMA_TYPE_USER)
+               goto kick_and_return;
+
+       /* Serialize access to RQ */
+       spin_lock_irqsave(&vqp->rq->lock, flags);
+
+       while (wr) {
+               /* Validate required fields */
+               if (unlikely(!wr->num_sge)) {
+                       rc = -EINVAL;
+                       goto out_bad_wr;
+               }
+
+               /* Calculate size of SGE array to copy */
+               sgl_size = sizeof(struct vrdma_sge) * wr->num_sge;
+               cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+               if (!cmd) {
+                       rc = -ENOMEM;
+                       goto out_bad_wr;
+               }
+
+               /* Fill command */
+               cmd->qpn = vqp->qp_handle;
+               cmd->wr_id = (ibqp->qp_type == IB_QPT_GSI) ? 0 : wr->wr_id;
+               cmd->num_sge = wr->num_sge;
+
+               /* Copy SGEs from the WR into the trailing sge_list[] area */
+               memcpy(cmd->sge_list, wr->sg_list, sgl_size);
+
+               /* Prepare scatterlist for virtqueue */
+               sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+               sgs[0] = &hdr;
+
+               /* Add to virtqueue */
+               rc = virtqueue_add_sgs(vqp->rq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+               if (rc) {
+                       kfree(cmd);
+                       goto out_bad_wr;
+               }
+
+               wr = wr->next;
+       }
+
+       spin_unlock_irqrestore(&vqp->rq->lock, flags);
+
+kick_and_return:
+       virtqueue_kick(vqp->rq->vq);
+       return 0;
+
+out_bad_wr:
+       *bad_wr = wr;
+       spin_unlock_irqrestore(&vqp->rq->lock, flags);
+       virtqueue_kick(vqp->rq->vq); /* Still kick so backend knows partial update */
+       return rc;
+}
+
+/**
+ * vrdma_copy_inline_data_to_wqe - Copy inline data from SGEs into WQE buffer
+ * @wqe:       Pointer to the vrdma_cmd_post_send command structure
+ * @ibwr:      IB send work request containing SGEs with inline data
+ *
+ * Copies all data referenced by SGEs into a contiguous area immediately
+ * following the WQE header, typically used when IB_SEND_INLINE is set.
+ *
+ * Assumes:
+ * - Memory at sge->addr is accessible (kernel virtual address)
+ * - Total size <= device max_inline_data
+ * - wqe has enough tailroom for all data
+ *
+ * Context: Called under the SQ lock (atomic context); performs no allocation.
+ */
+static void vrdma_copy_inline_data_to_wqe(struct vrdma_cmd_post_send *wqe,
+                                   const struct ib_send_wr *ibwr)
+{
+       const struct ib_sge *sge;
+       char *dst = (char *)wqe + sizeof(*wqe); /* Start after header */
+       int i;
+
+       for (i = 0; i < ibwr->num_sge; i++) {
+               sge = &ibwr->sg_list[i];
+
+               /* Skip zero-length segments */
+               if (sge->length == 0)
+                       continue;
+
+               /*
+                * WARNING: sge->addr is a user-space or kernel virtual address.
+                * Using (void *)(uintptr_t)sge->addr assumes it's directly dereferenceable.
+                * This is only valid if:
+                *   - The QP is KERNEL type AND
+                *   - The memory was registered and we trust its mapping
+                */
+
+               memcpy(dst, (void *)(uintptr_t)sge->addr, sge->length);
+               dst += sge->length;
+       }
+}
+
+/**
+ * vrdma_post_send - Post a list of send work requests to the SQ
+ * @ibqp:      Queue pair
+ * @wr:                List of work requests
+ * @bad_wr:    Out parameter pointing to failing WR on error
+ *
+ * Converts each ib_send_wr into a vrdma_cmd_post_send and submits it
+ * via the send virtqueue. Supports both kernel and user QPs.
+ *
+ * Context: Any context; the SQ lock is held and only GFP_ATOMIC allocations are made.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -EINVAL, -ENOMEM)
+ * * @bad_wr set to first failed WR
+ */
+static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+                          const struct ib_send_wr **bad_wr)
+{
+       struct vrdma_qp *vqp = to_vqp(ibqp);
+       struct vrdma_cmd_post_send *cmd;
+       unsigned int sgl_size;
+       int rc = 0;
+       struct scatterlist hdr;
+       struct scatterlist *sgs[1];
+       unsigned long flags;
+
+       /* Fast path for user-space QP: defer to userspace */
+       if (vqp->type == VIRTIO_RDMA_TYPE_USER) {
+               virtqueue_kick(vqp->sq->vq);
+               return 0;
+       }
+
+       spin_lock_irqsave(&vqp->sq->lock, flags);
+
+       while (wr) {
+               *bad_wr = wr; /* In case of error */
+
+               /* Validate opcode support in kernel QP */
+               switch (wr->opcode) {
+               case IB_WR_SEND:
+               case IB_WR_SEND_WITH_IMM:
+               case IB_WR_SEND_WITH_INV:
+               case IB_WR_RDMA_WRITE:
+               case IB_WR_RDMA_WRITE_WITH_IMM:
+               case IB_WR_RDMA_READ:
+               case IB_WR_ATOMIC_CMP_AND_SWP:
+               case IB_WR_ATOMIC_FETCH_AND_ADD:
+               case IB_WR_LOCAL_INV:
+               case IB_WR_REG_MR:
+                       break;
+               default:
+                       pr_warn("vRDMA: unsupported opcode %d for kernel QP\n",
+                               wr->opcode);
+                       rc = -EINVAL;
+                       goto out_unlock;
+               }
+
+               /* Size the trailing area: SGE list, or inline payload */
+               if (wr->send_flags & IB_SEND_INLINE) {
+                       int i;
+
+                       sgl_size = 0;
+                       for (i = 0; i < wr->num_sge; i++)
+                               sgl_size += wr->sg_list[i].length;
+               } else {
+                       sgl_size = wr->num_sge * sizeof(struct vrdma_sge);
+               }
+
+               cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+               if (!cmd) {
+                       rc = -ENOMEM;
+                       goto out_unlock;
+               }
+
+               /* Fill common fields */
+               cmd->wr_id = wr->wr_id;
+               cmd->send_flags = wr->send_flags;
+               cmd->opcode = wr->opcode;
+
+               /* Inline payload or SGE list follows the header */
+               if (wr->send_flags & IB_SEND_INLINE) {
+                       /* TODO: Check max_inline_data limit */
+                       cmd->inline_len = sgl_size;
+                       vrdma_copy_inline_data_to_wqe(cmd, wr);
+               } else {
+                       cmd->num_sge = wr->num_sge;
+                       memcpy(cmd->sg_list, wr->sg_list, sgl_size);
+               }
+
+               /* Handle immediate data (SEND_WITH_IMM, WRITE_WITH_IMM) */
+               if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+                   wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+                       cmd->ex.imm_data = wr->ex.imm_data;
+
+               /* Handle invalidate key (SEND_WITH_INV, LOCAL_INV) */
+               if (wr->opcode == IB_WR_SEND_WITH_INV ||
+                   wr->opcode == IB_WR_LOCAL_INV)
+                       cmd->ex.invalidate_rkey = wr->ex.invalidate_rkey;
+
+               /* RDMA and Atomic specific fields */
+               switch (ibqp->qp_type) {
+               case IB_QPT_RC:
+                       switch (wr->opcode) {
+                       case IB_WR_RDMA_READ:
+                       case IB_WR_RDMA_WRITE:
+                       case IB_WR_RDMA_WRITE_WITH_IMM:
+                               cmd->wr.rdma.remote_addr = rdma_wr(wr)->remote_addr;
+                               cmd->wr.rdma.rkey = rdma_wr(wr)->rkey;
+                               break;
+
+                       case IB_WR_ATOMIC_CMP_AND_SWP:
+                       case IB_WR_ATOMIC_FETCH_AND_ADD:
+                               cmd->wr.atomic.remote_addr = atomic_wr(wr)->remote_addr;
+                               cmd->wr.atomic.rkey = atomic_wr(wr)->rkey;
+                               cmd->wr.atomic.compare_add = atomic_wr(wr)->compare_add;
+                               if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP)
+                                       cmd->wr.atomic.swap = atomic_wr(wr)->swap;
+                               break;
+
+                       case IB_WR_REG_MR: {
+                               const struct ib_reg_wr *reg = reg_wr(wr);
+                               struct vrdma_mr *vmr = to_vmr(reg->mr);
+                               cmd->wr.reg.mrn = vmr->mr_handle;
+                               cmd->wr.reg.key = reg->key;
+                               cmd->wr.reg.access = reg->access;
+                               break;
+                       }
+                       default:
+                               break;
+                       }
+                       break;
+
+               case IB_QPT_UD:
+               case IB_QPT_GSI: {
+                       if (!ud_wr(wr)->ah) {
+                               pr_warn("vRDMA: invalid address handle in UD WR\n");
+                               kfree(cmd);
+                               rc = -EINVAL;
+                               goto out_unlock;
+                       }
+                       cmd->wr.ud.remote_qpn = ud_wr(wr)->remote_qpn;
+                       cmd->wr.ud.remote_qkey = ud_wr(wr)->remote_qkey;
+                       cmd->wr.ud.av = to_vah(ud_wr(wr)->ah)->av;
+                       break;
+               }
+
+               default:
+                       pr_err("vRDMA: unsupported QP type %d\n", ibqp->qp_type);
+                       kfree(cmd);
+                       rc = -EINVAL;
+                       goto out_unlock;
+               }
+
+               /* Prepare scatterlist for virtqueue */
+               sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+               sgs[0] = &hdr;
+
+               rc = virtqueue_add_sgs(vqp->sq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+               if (rc) {
+                       dev_err(&vqp->sq->vq->vdev->dev,
+                               "vRDMA: failed to add send WR to vq: %d\n", rc);
+                       kfree(cmd);
+                       goto out_unlock;
+               }
+
+               /* Advance to next WR */
+               wr = wr->next;
+       }
+
+out_unlock:
+       spin_unlock_irqrestore(&vqp->sq->lock, flags);
+
+       /* Kick even after a partial failure so already-queued WRs get processed */
+       virtqueue_kick(vqp->sq->vq);
+       return rc;
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
        .owner = THIS_MODULE,
        .uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2246,7 +2552,9 @@ static const struct ib_device_ops vrdma_dev_ops = {
        .mmap_free = vrdma_mmap_free,
        .modify_port = vrdma_modify_port,
        .modify_qp = vrdma_modify_qp,
-       .poll_cq = vrdma_poll_cq,                       
+       .poll_cq = vrdma_poll_cq,
+       .post_recv = vrdma_post_recv,
+       .post_send = vrdma_post_send,
 };
 
 /**
-- 
2.43.0
