The branch main has been updated by imp:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=1bce7cd885e7e5b376a60367629a0f76ff7f0167

commit 1bce7cd885e7e5b376a60367629a0f76ff7f0167
Author:     Warner Losh <i...@freebsd.org>
AuthorDate: 2024-06-14 22:40:08 +0000
Commit:     Warner Losh <i...@freebsd.org>
CommitDate: 2024-06-14 22:40:08 +0000

    nvme: Add Linux compatible ioctls
    
    Add the NVME_IOCTL_ID, NVME_IOCTL_ADMIN_CMD, and NVME_IOCTL_IO_CMD Linux
    compatible ioctls. These may be run on either an I/O (ns) dev or a nvme
    (admin) dev. Linux allows both on either device, and programs use this
    and aren't careful about having the right device open. Emulate this
    feature, and implement these ioctls. The data is passed into the
    kernel in host byte order (not converted to le). Results are returned in
    host order.
    
    The timeout field is ignored, and the metadata and metadata_len fields
    must be zero.
    
    The addr field can be null, even when the data_len is non zero (FreeBSD's
    ioctl interface prohibits this, Linux's just ignores the inconsistency).
    
    Only the cdw0 of the completion is returned from the command: the status
    is not returned in the 'result' field. XXX need to verify that this is
    what Linux does on an error signaled from the drive.
    
    No external include file is yet available for this: most programs that
    call this interface either use a linux-specific path <linux/nvme.h> or
    have their own private copy of the data. It's unclear what the best
    thing to do is.
    
    Also, create a /dev/nvmeXnY as an alias for /dev/nvmeXnsY.
    
    These changes allow a native build of nvme-cli to work for everything
    that doesn't depend on sysfs entries in /sys, calls that use metadata,
    send / receive drive data and sed functionality not in our nvme driver.
    
    Sponsored by:           Netflix
    Co-Authored-by:         Chuck Tuffli <ch...@freebsd.org>
    Reviewed by:            chuck
    Differential Revision:  https://reviews.freebsd.org/D45415
---
 sys/dev/nvme/nvme.h       |   6 +++
 sys/dev/nvme/nvme_ctrlr.c | 114 +++++++++++++++++++++++++++++++++++++++++++++-
 sys/dev/nvme/nvme_linux.h |  58 +++++++++++++++++++++++
 sys/dev/nvme/nvme_ns.c    |  14 +++++-
 4 files changed, 190 insertions(+), 2 deletions(-)

diff --git a/sys/dev/nvme/nvme.h b/sys/dev/nvme/nvme.h
index a389fc443743..1db50d24c259 100644
--- a/sys/dev/nvme/nvme.h
+++ b/sys/dev/nvme/nvme.h
@@ -1902,6 +1902,7 @@ struct thread;
 struct nvme_namespace;
 struct nvme_controller;
 struct nvme_consumer;
+struct nvme_passthru_cmd;
 
 typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
 
@@ -1921,6 +1922,11 @@ int      nvme_ctrlr_passthrough_cmd(struct 
nvme_controller *ctrlr,
                                   uint32_t nsid, int is_user_buffer,
                                   int is_admin_cmd);
 
+int    nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+                                     struct nvme_passthru_cmd *npc,
+                                     uint32_t nsid, bool is_user,
+                                     bool is_admin);
+
 /* Admin functions */
 void   nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
                                   uint8_t feature, uint32_t cdw11,
diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
index 155aedf2f31a..f058a4e33b9f 100644
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -43,6 +43,7 @@
 #include <vm/vm.h>
 
 #include "nvme_private.h"
+#include "nvme_linux.h"
 
 #define B4_CHK_RDY_DELAY_MS    2300            /* work around controller bug */
 
@@ -1269,7 +1270,7 @@ nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
                                ret = EFAULT;
                                goto err;
                        }
-                       req = nvme_allocate_request_vaddr(buf->b_data, pt->len, 
+                       req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
                            nvme_pt_done, pt);
                } else
                        req = nvme_allocate_request_vaddr(pt->buf, pt->len,
@@ -1314,6 +1315,103 @@ err:
        return (ret);
 }
 
+static void
+nvme_npc_done(void *arg, const struct nvme_completion *cpl)
+{
+       struct nvme_passthru_cmd *npc = arg;
+       struct mtx *mtx = (void *)(uintptr_t)npc->metadata;
+
+       npc->result = cpl->cdw0;        /* cpl in host order by now */
+       mtx_lock(mtx);
+       npc->metadata = 0;
+       wakeup(npc);
+       mtx_unlock(mtx);
+}
+
+/* XXX refactor? */
+
+int
+nvme_ctrlr_linux_passthru_cmd(struct nvme_controller *ctrlr,
+    struct nvme_passthru_cmd *npc, uint32_t nsid, bool is_user, bool is_admin)
+{
+       struct nvme_request     *req;
+       struct mtx              *mtx;
+       struct buf              *buf = NULL;
+       int                     ret = 0;
+
+       /*
+        * We don't support metadata.
+        */
+       if (npc->metadata != 0 || npc->metadata_len != 0)
+               return (EIO);
+
+       if (npc->data_len > 0 && npc->addr != 0) {
+               if (npc->data_len > ctrlr->max_xfer_size) {
+                       nvme_printf(ctrlr,
+                           "npc->data_len (%d) exceeds max_xfer_size (%d)\n",
+                           npc->data_len, ctrlr->max_xfer_size);
+                       return (EIO);
+               }
+               /* We only support data out or data in commands, but not both 
at once. */
+               if ((npc->opcode & 0x3) == 0 || (npc->opcode & 0x3) == 3)
+                       return (EINVAL);
+               if (is_user) {
+                       /*
+                        * Ensure the user buffer is wired for the duration of
+                        *  this pass-through command.
+                        */
+                       PHOLD(curproc);
+                       buf = uma_zalloc(pbuf_zone, M_WAITOK);
+                       buf->b_iocmd = npc->opcode & 1 ? BIO_WRITE : BIO_READ;
+                       if (vmapbuf(buf, (void *)npc->addr, npc->data_len, 1) < 
0) {
+                               ret = EFAULT;
+                               goto err;
+                       }
+                       req = nvme_allocate_request_vaddr(buf->b_data, 
npc->data_len,
+                           nvme_npc_done, npc);
+               } else
+                       req = nvme_allocate_request_vaddr((void *)npc->addr, 
npc->data_len,
+                           nvme_npc_done, npc);
+       } else
+               req = nvme_allocate_request_null(nvme_npc_done, npc);
+
+       req->cmd.opc = npc->opcode;
+       req->cmd.fuse = npc->flags;
+       req->cmd.rsvd2 = htole16(npc->cdw2);
+       req->cmd.rsvd3 = htole16(npc->cdw3);
+       req->cmd.cdw10 = htole32(npc->cdw10);
+       req->cmd.cdw11 = htole32(npc->cdw11);
+       req->cmd.cdw12 = htole32(npc->cdw12);
+       req->cmd.cdw13 = htole32(npc->cdw13);
+       req->cmd.cdw14 = htole32(npc->cdw14);
+       req->cmd.cdw15 = htole32(npc->cdw15);
+
+       req->cmd.nsid = htole32(nsid);
+
+       mtx = mtx_pool_find(mtxpool_sleep, npc);
+       npc->metadata = (uintptr_t) mtx;
+
+       /* XXX no timeout passed down */
+       if (is_admin)
+               nvme_ctrlr_submit_admin_request(ctrlr, req);
+       else
+               nvme_ctrlr_submit_io_request(ctrlr, req);
+
+       mtx_lock(mtx);
+       while (npc->metadata != 0)
+               mtx_sleep(npc, mtx, PRIBIO, "nvme_npc", 0);
+       mtx_unlock(mtx);
+
+       if (buf != NULL) {
+               vunmapbuf(buf);
+err:
+               uma_zfree(pbuf_zone, buf);
+               PRELE(curproc);
+       }
+
+       return (ret);
+}
+
 static int
 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
     struct thread *td)
@@ -1324,6 +1422,7 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t 
arg, int flag,
        ctrlr = cdev->si_drv1;
 
        switch (cmd) {
+       case NVME_IOCTL_RESET: /* Linux compat */
        case NVME_RESET_CONTROLLER:
                nvme_ctrlr_reset(ctrlr);
                break;
@@ -1342,6 +1441,19 @@ nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t 
arg, int flag,
        case NVME_GET_MAX_XFER_SIZE:
                *(uint64_t *)arg = ctrlr->max_xfer_size;
                break;
+       /* Linux Compatible (see nvme_linux.h) */
+       case NVME_IOCTL_ID:
+               td->td_retval[0] = 0xfffffffful;
+               return (0);
+
+       case NVME_IOCTL_ADMIN_CMD:
+       case NVME_IOCTL_IO_CMD: {
+               struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+               return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, npc->nsid, 
true,
+                   cmd == NVME_IOCTL_ADMIN_CMD));
+       }
+
        default:
                return (ENOTTY);
        }
diff --git a/sys/dev/nvme/nvme_linux.h b/sys/dev/nvme/nvme_linux.h
new file mode 100644
index 000000000000..aaa68e1d34f8
--- /dev/null
+++ b/sys/dev/nvme/nvme_linux.h
@@ -0,0 +1,58 @@
+/*-
+ * Copyright (c) 2024, Netflix Inc.
+ * Written by Warner Losh
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/*
+ * Linux compatible NVME ioctls. So far we just support ID, ADMIN_CMD and
+ * IO_CMD. The rest are not supported.
+ */
+
+
+#include <sys/ioccom.h>
+#include <sys/_types.h>
+
+struct nvme_passthru_cmd {
+       __uint8_t       opcode;
+       __uint8_t       flags;
+       __uint16_t      rsvd1;
+       __uint32_t      nsid;
+       __uint32_t      cdw2;
+       __uint32_t      cdw3;
+       __uint64_t      metadata;
+       __uint64_t      addr;
+       __uint32_t      metadata_len;
+       __uint32_t      data_len;
+       __uint32_t      cdw10;
+       __uint32_t      cdw11;
+       __uint32_t      cdw12;
+       __uint32_t      cdw13;
+       __uint32_t      cdw14;
+       __uint32_t      cdw15;
+       __uint32_t      timeout_ms;
+       __uint32_t      result;
+};
+
+#define nvme_admin_cmd nvme_passthru_cmd
+
+/*
+ * Linux nvme ioctls, commented out ones are not supported
+ */
+#define NVME_IOCTL_ID          _IO('N', 0x40)
+#define NVME_IOCTL_ADMIN_CMD   _IOWR('N', 0x41, struct nvme_admin_cmd)
+/* #define NVME_IOCTL_SUBMIT_IO        _IOW('N', 0x42, struct nvme_user_io) */
+#define NVME_IOCTL_IO_CMD      _IOWR('N', 0x43, struct nvme_passthru_cmd)
+#define NVME_IOCTL_RESET       _IO('N', 0x44)
+/* #define NVME_IOCTL_SUBSYS_RESET     _IO('N', 0x45) */
+/* #define NVME_IOCTL_RESCAN   _IO('N', 0x46) */
+/* #define NVME_IOCTL_ADMIN64_CMD      _IOWR('N', 0x47, struct 
nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD _IOWR('N', 0x48, struct nvme_passthru_cmd64) */
+/* #define NVME_IOCTL_IO64_CMD_VEC     _IOWR('N', 0x49, struct 
nvme_passthru_cmd64) */
+
+/* io_uring async commands: */
+/* #define NVME_URING_CMD_IO   _IOWR('N', 0x80, struct nvme_uring_cmd) */
+/* #define NVME_URING_CMD_IO_VEC       _IOWR('N', 0x81, struct nvme_uring_cmd) 
*/
+/* #define NVME_URING_CMD_ADMIN        _IOWR('N', 0x82, struct nvme_uring_cmd) 
*/
+/* #define NVME_URING_CMD_ADMIN_VEC _IOWR('N', 0x83, struct nvme_uring_cmd) */
diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c
index 4c65e2c49e64..3f29382fe42f 100644
--- a/sys/dev/nvme/nvme_ns.c
+++ b/sys/dev/nvme/nvme_ns.c
@@ -43,6 +43,7 @@
 #include <geom/geom.h>
 
 #include "nvme_private.h"
+#include "nvme_linux.h"
 
 static void            nvme_bio_child_inbed(struct bio *parent, int bio_error);
 static void            nvme_bio_child_done(void *arg,
@@ -93,6 +94,18 @@ nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, 
int flag,
        case DIOCGSECTORSIZE:
                *(u_int *)arg = nvme_ns_get_sector_size(ns);
                break;
+       /* Linux Compatible (see nvme_linux.h) */
+       case NVME_IOCTL_ID:
+               td->td_retval[0] = ns->id;
+               return (0);
+
+       case NVME_IOCTL_ADMIN_CMD:
+       case NVME_IOCTL_IO_CMD: {
+               struct nvme_passthru_cmd *npc = (struct nvme_passthru_cmd *)arg;
+
+               return (nvme_ctrlr_linux_passthru_cmd(ctrlr, npc, ns->id, true,
+                   cmd == NVME_IOCTL_ADMIN_CMD));
+       }
        default:
                return (ENOTTY);
        }
@@ -610,7 +623,6 @@ nvme_ns_construct(struct nvme_namespace *ns, uint32_t id,
                return (ENXIO);
        ns->cdev->si_drv2 = make_dev_alias(ns->cdev, "%sns%d",
            device_get_nameunit(ctrlr->dev), ns->id);
-
        ns->cdev->si_flags |= SI_UNMAPPED;
 
        return (0);

Reply via email to