Implement a new virtual PCI driver based on the VFIO framework.
This driver allows users to pass through PCI devices to UML via
VFIO. Currently, only MSI-X capable devices are supported, and
it is assumed that drivers will use MSI-X.

Signed-off-by: Tiwei Bie <tiwei....@antgroup.com>
---
 arch/um/drivers/Kconfig     |   8 +
 arch/um/drivers/Makefile    |   2 +
 arch/um/drivers/vfio_kern.c | 648 ++++++++++++++++++++++++++++++++++++
 arch/um/drivers/vfio_user.c | 323 ++++++++++++++++++
 arch/um/drivers/vfio_user.h |  44 +++
 5 files changed, 1025 insertions(+)
 create mode 100644 arch/um/drivers/vfio_kern.c
 create mode 100644 arch/um/drivers/vfio_user.c
 create mode 100644 arch/um/drivers/vfio_user.h

diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 9cb196070614..d7bb447ff958 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -367,3 +367,11 @@ config UML_PCI_OVER_VIRTIO_DEVICE_ID
          There's no official device ID assigned (yet), set the one you
          wish to use for experimentation here. The default of -1 is
          not valid and will cause the driver to fail at probe.
+
+config UML_PCI_OVER_VFIO
+       bool "Enable VFIO-based PCI passthrough"
+       select UML_PCI
+       help
+         This driver provides support for VFIO-based PCI passthrough.
+         Currently, only MSI-X capable devices are supported, and it
+         is assumed that drivers will use MSI-X.
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 0a5820343ad3..336be56b8975 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -19,6 +19,7 @@ port-objs := port_kern.o port_user.o
 harddog-objs := harddog_kern.o
 harddog-builtin-$(CONFIG_UML_WATCHDOG) := harddog_user.o harddog_user_exp.o
 rtc-objs := rtc_kern.o rtc_user.o
+vfio_uml-objs := vfio_kern.o vfio_user.o
 
 LDFLAGS_vde.o = $(shell $(CC) $(CFLAGS) -print-file-name=libvdeplug.a)
 
@@ -62,6 +63,7 @@ obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o
 obj-$(CONFIG_UML_RTC) += rtc.o
 obj-$(CONFIG_UML_PCI) += virt-pci.o
 obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o
+obj-$(CONFIG_UML_PCI_OVER_VFIO) += vfio_uml.o
 
 # pcap_user.o must be added explicitly.
 USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o
diff --git a/arch/um/drivers/vfio_kern.c b/arch/um/drivers/vfio_kern.c
new file mode 100644
index 000000000000..805f589a568d
--- /dev/null
+++ b/arch/um/drivers/vfio_kern.c
@@ -0,0 +1,648 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei....@antgroup.com>
+ */
+#include <linux/module.h>
+#include <linux/logic_iomem.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/unaligned.h>
+#include <irq_kern.h>
+#include <init.h>
+#include <os.h>
+
+#include "virt-pci.h"
+#include "vfio_user.h"
+
+#define MAX_GROUPS     8
+#define MAX_DEVICES    8
+
+#define to_vdev(_pdev) container_of(_pdev, struct uml_vfio_device, pdev)
+
+/*
+ * Per-vector interrupt context. A pointer to one of these is handed to
+ * um_request_irq() as the opaque cookie of each activated vector.
+ */
+struct uml_vfio_intr_ctx {
+       struct uml_vfio_device *dev;    /* owning device */
+       int irq;        /* result of um_request_irq(); negative while inactive */
+};
+
+/*
+ * State of one passed-through PCI device: the virtual PCI device exposed
+ * to UML, the userspace-side VFIO handle and a shadow of the MSI-X setup.
+ */
+struct uml_vfio_device {
+       const char *name;       /* device string given on the command line */
+       int group;              /* group fd returned by uml_vfio_open_group() */
+
+       struct um_pci_device pdev;
+       struct uml_vfio_user_device udev;
+       struct uml_vfio_intr_ctx *intr_ctx;     /* udev.irq_count entries */
+
+       int msix_cap;           /* config space offset of the MSI-X capability */
+       int msix_bar;           /* BAR indicator (BIR) of the MSI-X table */
+       int msix_offset;        /* offset of the MSI-X table inside that BAR */
+       int msix_size;          /* size of the MSI-X table in bytes */
+       u32 *msix_data;         /* message data last written, per table entry */
+};
+
+/*
+ * The single VFIO container used by all groups. @users is updated with
+ * uml_vfio_container_mtx held.
+ */
+static struct {
+       int fd;         /* container fd stored by uml_vfio_open_container() */
+       int users;      /* number of groups currently attached */
+} uml_vfio_container;
+static DEFINE_MUTEX(uml_vfio_container_mtx);
+
+/*
+ * Table of opened VFIO groups, accessed with uml_vfio_groups_mtx held.
+ * A slot is in use while @users is greater than zero.
+ */
+static struct {
+       int id;         /* group id passed to uml_vfio_open_group() */
+       int fd;         /* fd returned by uml_vfio_user_open_group() */
+       int users;      /* number of references taken on this group */
+} uml_vfio_groups[MAX_GROUPS];
+static DEFINE_MUTEX(uml_vfio_groups_mtx);
+
+/* Devices requested on the command line; uml_vfio_cmdline_set() fills the first free slot. */
+static struct uml_vfio_device *uml_vfio_devices[MAX_DEVICES];
+
+/*
+ * Open the VFIO container and remember its fd in uml_vfio_container.
+ * Returns 0 on success or the negative error code from the open helper.
+ */
+static int uml_vfio_open_container(void)
+{
+       int container_fd = uml_vfio_user_open_container();
+
+       if (container_fd < 0)
+               return container_fd;
+
+       uml_vfio_container.fd = container_fd;
+       return 0;
+}
+
+/* Close the container fd stored by uml_vfio_open_container(). */
+static void uml_vfio_release_container(void)
+{
+       int container_fd = uml_vfio_container.fd;
+
+       os_close_file(container_fd);
+}
+
+/*
+ * Attach the group behind @group_fd to the container. The first group
+ * attached also sets up the IOMMU; if that fails the group is detached
+ * again. Returns 0 or a negative error code.
+ */
+static int uml_vfio_set_container(int group_fd)
+{
+       int container_fd, ret;
+
+       guard(mutex)(&uml_vfio_container_mtx);
+
+       container_fd = uml_vfio_container.fd;
+
+       ret = uml_vfio_user_set_container(container_fd, group_fd);
+       if (ret)
+               return ret;
+
+       /* Only the first user has to configure the IOMMU. */
+       if (++uml_vfio_container.users > 1)
+               return 0;
+
+       ret = uml_vfio_user_setup_iommu(container_fd);
+       if (!ret)
+               return 0;
+
+       uml_vfio_user_unset_container(container_fd, group_fd);
+       uml_vfio_container.users--;
+       return ret;
+}
+
+/* Detach the group behind @group_fd from the container and drop its user count. */
+static void uml_vfio_unset_container(int group_fd)
+{
+       guard(mutex)(&uml_vfio_container_mtx);
+
+       uml_vfio_container.users--;
+       uml_vfio_user_unset_container(uml_vfio_container.fd, group_fd);
+}
+
+/*
+ * Get the fd of VFIO group @group_id, opening the group and attaching it
+ * to the container on first use. Every successful call takes one
+ * reference on the group.
+ *
+ * Returns the group fd, -ENOSPC when the group table is full, or the
+ * negative error code of the failing step.
+ */
+static int uml_vfio_open_group(int group_id)
+{
+       int slot = -1, group_fd, ret, i;
+
+       guard(mutex)(&uml_vfio_groups_mtx);
+
+       /* Reuse an already opened group, noting the first free slot on the way. */
+       for (i = 0; i < MAX_GROUPS; i++) {
+               if (uml_vfio_groups[i].users > 0) {
+                       if (uml_vfio_groups[i].id != group_id)
+                               continue;
+                       uml_vfio_groups[i].users++;
+                       return uml_vfio_groups[i].fd;
+               }
+               if (slot < 0 && uml_vfio_groups[i].users == 0)
+                       slot = i;
+       }
+
+       if (slot < 0)
+               return -ENOSPC;
+
+       group_fd = uml_vfio_user_open_group(group_id);
+       if (group_fd < 0)
+               return group_fd;
+
+       ret = uml_vfio_set_container(group_fd);
+       if (ret) {
+               os_close_file(group_fd);
+               return ret;
+       }
+
+       uml_vfio_groups[slot].users = 1;
+       uml_vfio_groups[slot].fd = group_fd;
+       uml_vfio_groups[slot].id = group_id;
+
+       return group_fd;
+}
+
+/*
+ * Drop one reference on the group owning @group_fd. The last reference
+ * detaches the group from the container and closes the fd.
+ *
+ * Returns 0, or -ENOENT if no in-use group has that fd.
+ */
+static int uml_vfio_release_group(int group_fd)
+{
+       int slot = -1, i;
+
+       guard(mutex)(&uml_vfio_groups_mtx);
+
+       for (i = 0; i < MAX_GROUPS; i++) {
+               if (uml_vfio_groups[i].users <= 0)
+                       continue;
+               if (uml_vfio_groups[i].fd == group_fd) {
+                       slot = i;
+                       break;
+               }
+       }
+
+       if (slot < 0)
+               return -ENOENT;
+
+       if (--uml_vfio_groups[slot].users > 0)
+               return 0;
+
+       uml_vfio_unset_container(group_fd);
+       os_close_file(group_fd);
+       return 0;
+}
+
+/*
+ * Handler for the eventfd of one vector. @opaque is the vector's
+ * struct uml_vfio_intr_ctx; its position in dev->intr_ctx gives the
+ * vector number.
+ */
+static irqreturn_t uml_vfio_interrupt(int unused, void *opaque)
+{
+       struct uml_vfio_intr_ctx *ctx = opaque;
+       struct uml_vfio_device *dev = ctx->dev;
+       int vector = ctx - dev->intr_ctx;
+       int efd = dev->udev.irqfd[vector];
+       int irq = dev->msix_data[vector];
+       uint64_t count;
+       int n;
+
+       /*
+        * Keep reading: each full read delivers the interrupt once,
+        * -EINTR is retried and anything else ends the loop.
+        */
+       for (;;) {
+               n = os_read_file(efd, &count, sizeof(count));
+               if (n == sizeof(count)) {
+                       generic_handle_irq(irq);
+                       continue;
+               }
+               if (n != -EINTR)
+                       break;
+       }
+       WARN(n != -EAGAIN, "read returned %d\n", n);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Activate vector @index: create its eventfd, request a UML IRQ on it
+ * and register the fd for SIGIO. Does nothing if the vector is already
+ * active. Returns 0 or a negative error code.
+ */
+static int uml_vfio_activate_irq(struct uml_vfio_device *dev, int index)
+{
+       struct uml_vfio_intr_ctx *ctx = dev->intr_ctx + index;
+       int efd, ret;
+
+       if (ctx->irq >= 0)
+               return 0;
+
+       efd = uml_vfio_user_activate_irq(&dev->udev, index);
+       if (efd < 0)
+               return efd;
+
+       ctx->irq = um_request_irq(UM_IRQ_ALLOC, efd, IRQ_READ,
+                                 uml_vfio_interrupt, IRQF_SHARED,
+                                 "vfio-uml", ctx);
+       if (ctx->irq < 0) {
+               ret = ctx->irq;
+       } else {
+               ret = add_sigio_fd(efd);
+               if (!ret)
+                       return 0;
+
+               um_free_irq(ctx->irq, ctx);
+               ctx->irq = -1;
+       }
+
+       /* Undo the eventfd setup on any failure. */
+       uml_vfio_user_deactivate_irq(&dev->udev, index);
+       return ret;
+}
+
+/* Tear down vector @index if it is active. Always returns 0. */
+static int uml_vfio_deactivate_irq(struct uml_vfio_device *dev, int index)
+{
+       struct uml_vfio_intr_ctx *ctx = dev->intr_ctx + index;
+
+       if (ctx->irq < 0)
+               return 0;
+
+       ignore_sigio_fd(dev->udev.irqfd[index]);
+       um_free_irq(ctx->irq, ctx);
+       uml_vfio_user_deactivate_irq(&dev->udev, index);
+       ctx->irq = -1;
+
+       return 0;
+}
+
+/*
+ * React to a config space write that overlaps the MSI-X capability.
+ * Only a 16-bit write to the flags word is acted on: when, apart from
+ * the table size field, it carries just the enable bit or nothing at
+ * all, the vector set is pushed to VFIO again.
+ */
+static int uml_vfio_update_msix_cap(struct uml_vfio_device *dev,
+                                   unsigned int offset, int size,
+                                   unsigned long val)
+{
+       unsigned long flags;
+
+       if (size != 2 || offset != dev->msix_cap + PCI_MSIX_FLAGS)
+               return 0;
+
+       flags = val & ~PCI_MSIX_FLAGS_QSIZE;
+       if (flags == PCI_MSIX_FLAGS_ENABLE || flags == 0)
+               return uml_vfio_user_update_irqs(&dev->udev);
+
+       return 0;
+}
+
+/*
+ * React to a BAR write that overlaps the MSI-X table. Only 32-bit
+ * writes to the message data field of an entry are acted on: the value
+ * is recorded, and the vector is activated for a non-zero value or
+ * deactivated for zero.
+ */
+static int uml_vfio_update_msix_table(struct uml_vfio_device *dev,
+                                     unsigned int offset, int size,
+                                     unsigned long val)
+{
+       unsigned int rel;
+       int vector;
+
+       /* Position relative to the message data field of entry 0. */
+       rel = offset - (dev->msix_offset + PCI_MSIX_ENTRY_DATA);
+
+       if (size != 4 || rel % PCI_MSIX_ENTRY_SIZE != 0)
+               return 0;
+
+       vector = rel / PCI_MSIX_ENTRY_SIZE;
+       if (vector >= dev->udev.irq_count)
+               return -EINVAL;
+
+       dev->msix_data[vector] = val;
+
+       if (val)
+               return uml_vfio_activate_irq(dev, vector);
+
+       return uml_vfio_deactivate_irq(dev, vector);
+}
+
+/*
+ * Read @size bytes of config space at @offset and return them as a
+ * little-endian value. Returns ULONG_MAX if the read fails or @size is
+ * not 1, 2, 4 or (on 64-bit) 8.
+ */
+static unsigned long __uml_vfio_cfgspace_read(struct uml_vfio_device *dev,
+                                             unsigned int offset, int size)
+{
+       u8 raw[8];
+
+       memset(raw, 0xff, sizeof(raw));
+
+       if (uml_vfio_user_cfgspace_read(&dev->udev, offset, raw, size) != 0)
+               return ULONG_MAX;
+
+       if (size == 1)
+               return raw[0];
+       if (size == 2)
+               return le16_to_cpup((void *)raw);
+       if (size == 4)
+               return le32_to_cpup((void *)raw);
+#ifdef CONFIG_64BIT
+       if (size == 8)
+               return le64_to_cpup((void *)raw);
+#endif
+       return ULONG_MAX;
+}
+
+/* um_pci_ops.cfgspace_read: resolve the VFIO device behind @pdev and read from it. */
+static unsigned long uml_vfio_cfgspace_read(struct um_pci_device *pdev,
+                                           unsigned int offset, int size)
+{
+       return __uml_vfio_cfgspace_read(to_vdev(pdev), offset, size);
+}
+
+/*
+ * Write @val to config space at @offset as a little-endian value of
+ * @size bytes (1, 2, 4 or, on 64-bit, 8).
+ *
+ * Any other @size is rejected with a warning instead of being passed
+ * on: the staging buffer would be uninitialized (and too small for
+ * sizes above 8), so stack contents would be written to the device.
+ */
+static void __uml_vfio_cfgspace_write(struct uml_vfio_device *dev,
+                                     unsigned int offset, int size,
+                                     unsigned long val)
+{
+       u8 data[8];
+
+       switch (size) {
+       case 1:
+               data[0] = (u8)val;
+               break;
+       case 2:
+               put_unaligned_le16(val, (void *)data);
+               break;
+       case 4:
+               put_unaligned_le32(val, (void *)data);
+               break;
+#ifdef CONFIG_64BIT
+       case 8:
+               put_unaligned_le64(val, (void *)data);
+               break;
+#endif
+       default:
+               WARN_ON(1);
+               return;
+       }
+
+       WARN_ON(uml_vfio_user_cfgspace_write(&dev->udev, offset, data, size));
+}
+
+/*
+ * um_pci_ops.cfgspace_write: writes overlapping the MSI-X capability
+ * are first shown to uml_vfio_update_msix_cap(); the write itself is
+ * then always forwarded to the device.
+ */
+static void uml_vfio_cfgspace_write(struct um_pci_device *pdev,
+                                   unsigned int offset, int size,
+                                   unsigned long val)
+{
+       struct uml_vfio_device *vdev = to_vdev(pdev);
+       bool touches_msix_cap;
+
+       /* Does [offset, offset + size) overlap the capability? */
+       touches_msix_cap = offset + size > vdev->msix_cap &&
+                          offset < vdev->msix_cap + PCI_CAP_MSIX_SIZEOF;
+
+       if (touches_msix_cap)
+               WARN_ON(uml_vfio_update_msix_cap(vdev, offset, size, val));
+
+       __uml_vfio_cfgspace_write(vdev, offset, size, val);
+}
+
+/*
+ * um_pci_ops.bar_copy_from: copy @size bytes from BAR @bar at @offset
+ * into @buffer. The buffer is filled with 0xff first and the result of
+ * the read is not checked, so a failed read leaves all-ones behind.
+ */
+static void uml_vfio_bar_copy_from(struct um_pci_device *pdev, int bar,
+                                  void *buffer, unsigned int offset, int size)
+{
+       struct uml_vfio_user_device *udev = &to_vdev(pdev)->udev;
+
+       memset(buffer, 0xff, size);
+       uml_vfio_user_bar_read(udev, bar, offset, buffer, size);
+}
+
+/*
+ * um_pci_ops.bar_read: read a little-endian value of @size bytes from
+ * BAR @bar at @offset. Returns ULONG_MAX for a @size other than 1, 2, 4
+ * or (on 64-bit) 8.
+ */
+static unsigned long uml_vfio_bar_read(struct um_pci_device *pdev, int bar,
+                                      unsigned int offset, int size)
+{
+       u8 raw[8];
+
+       uml_vfio_bar_copy_from(pdev, bar, raw, offset, size);
+
+       if (size == 1)
+               return raw[0];
+       if (size == 2)
+               return le16_to_cpup((void *)raw);
+       if (size == 4)
+               return le32_to_cpup((void *)raw);
+#ifdef CONFIG_64BIT
+       if (size == 8)
+               return le64_to_cpup((void *)raw);
+#endif
+       return ULONG_MAX;
+}
+
+/*
+ * um_pci_ops.bar_copy_to: copy @size bytes from @buffer into BAR @bar
+ * at @offset. The result of the write is not checked.
+ */
+static void uml_vfio_bar_copy_to(struct um_pci_device *pdev, int bar,
+                                unsigned int offset, const void *buffer,
+                                int size)
+{
+       struct uml_vfio_user_device *udev = &to_vdev(pdev)->udev;
+
+       uml_vfio_user_bar_write(udev, bar, offset, buffer, size);
+}
+
+/*
+ * um_pci_ops.bar_write: write @val to BAR @bar at @offset as a
+ * little-endian value of @size bytes (1, 2, 4 or, on 64-bit, 8).
+ *
+ * Writes overlapping the MSI-X table are first shown to
+ * uml_vfio_update_msix_table() so the vector state can follow.
+ *
+ * Any other @size is rejected with a warning instead of being passed
+ * on: the staging buffer would be uninitialized (and too small for
+ * sizes above 8), so stack contents would be written to the device.
+ */
+static void uml_vfio_bar_write(struct um_pci_device *pdev, int bar,
+                              unsigned int offset, int size,
+                              unsigned long val)
+{
+       struct uml_vfio_device *dev = to_vdev(pdev);
+       u8 data[8];
+
+       if (bar == dev->msix_bar && offset + size > dev->msix_offset &&
+                       offset < dev->msix_offset + dev->msix_size)
+               WARN_ON(uml_vfio_update_msix_table(dev, offset, size, val));
+
+       switch (size) {
+       case 1:
+               data[0] = (u8)val;
+               break;
+       case 2:
+               put_unaligned_le16(val, (void *)data);
+               break;
+       case 4:
+               put_unaligned_le32(val, (void *)data);
+               break;
+#ifdef CONFIG_64BIT
+       case 8:
+               put_unaligned_le64(val, (void *)data);
+               break;
+#endif
+       default:
+               WARN_ON(1);
+               return;
+       }
+
+       uml_vfio_bar_copy_to(pdev, bar, offset, data, size);
+}
+
+/*
+ * um_pci_ops.bar_set: fill @size bytes of BAR @bar starting at @offset
+ * with @value, one single-byte write at a time.
+ */
+static void uml_vfio_bar_set(struct um_pci_device *pdev, int bar,
+                            unsigned int offset, u8 value, int size)
+{
+       struct uml_vfio_user_device *udev = &to_vdev(pdev)->udev;
+       unsigned int pos = offset;
+       int left;
+
+       for (left = size; left > 0; left--)
+               uml_vfio_user_bar_write(udev, bar, pos++, &value, 1);
+}
+
+/* Callbacks installed in pdev.ops by uml_vfio_open_device(). */
+static const struct um_pci_ops uml_vfio_um_pci_ops = {
+       .cfgspace_read  = uml_vfio_cfgspace_read,
+       .cfgspace_write = uml_vfio_cfgspace_write,
+       .bar_read       = uml_vfio_bar_read,
+       .bar_write      = uml_vfio_bar_write,
+       .bar_copy_from  = uml_vfio_bar_copy_from,
+       .bar_copy_to    = uml_vfio_bar_copy_to,
+       .bar_set        = uml_vfio_bar_set,
+};
+
+/*
+ * Walk the capability list in config space looking for capability @cap.
+ *
+ * The two low bits of a capability pointer are reserved and are masked
+ * off, and a pointer into the standard header ends the walk, matching
+ * how the PCI core walks the list. @ttl bounds the walk so that a
+ * looping list cannot hang us.
+ *
+ * Returns the config space offset of the capability, or 0 if it is not
+ * found.
+ */
+static u8 uml_vfio_find_capability(struct uml_vfio_device *dev, u8 cap)
+{
+       u8 id, pos;
+       u16 ent;
+       int ttl = 48;
+
+       pos = __uml_vfio_cfgspace_read(dev, PCI_CAPABILITY_LIST, sizeof(pos));
+
+       while (ttl--) {
+               if (pos < PCI_STD_HEADER_SIZEOF)
+                       break;
+               pos &= ~3;
+
+               ent = __uml_vfio_cfgspace_read(dev, pos, sizeof(ent));
+
+               id = ent & 0xff;
+               if (id == 0xff)
+                       break;
+               if (id == cap)
+                       return pos;
+
+               pos = ent >> 8;
+       }
+
+       return 0;
+}
+
+/*
+ * Locate the MSI-X capability, record where the MSI-X table lives and
+ * allocate the zeroed buffer behind dev->msix_data.
+ *
+ * Returns 0, -ENOTSUPP if the device has no MSI-X capability, or
+ * -ENOMEM.
+ */
+static int uml_vfio_read_msix_table(struct uml_vfio_device *dev)
+{
+       unsigned int cap_off, nr_entries;
+       u32 table;
+       u16 ctrl;
+
+       cap_off = uml_vfio_find_capability(dev, PCI_CAP_ID_MSIX);
+       if (cap_off == 0)
+               return -ENOTSUPP;
+
+       dev->msix_cap = cap_off;
+
+       table = __uml_vfio_cfgspace_read(dev, cap_off + PCI_MSIX_TABLE,
+                                        sizeof(table));
+       ctrl = __uml_vfio_cfgspace_read(dev, cap_off + PCI_MSIX_FLAGS,
+                                       sizeof(ctrl));
+
+       /* One register carries both the BAR indicator and the table offset. */
+       dev->msix_bar = table & PCI_MSIX_TABLE_BIR;
+       dev->msix_offset = table & PCI_MSIX_TABLE_OFFSET;
+
+       /* The table size field holds the number of entries minus one. */
+       nr_entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+       dev->msix_size = nr_entries * PCI_MSIX_ENTRY_SIZE;
+
+       dev->msix_data = kzalloc(dev->msix_size, GFP_KERNEL);
+       if (!dev->msix_data)
+               return -ENOMEM;
+
+       return 0;
+}
+
+/*
+ * Bring up one device: resolve and open its VFIO group, set up the
+ * userspace side, read the MSI-X layout, allocate the per-vector
+ * contexts and register the virtual PCI device.
+ *
+ * On failure everything acquired so far is released again. @dev itself
+ * and dev->name are not freed here.
+ *
+ * Returns 0 or a negative error code.
+ */
+static int uml_vfio_open_device(struct uml_vfio_device *dev)
+{
+       int group_id, ret, vec;
+
+       group_id = uml_vfio_user_get_group_id(dev->name);
+       if (group_id < 0)
+               return group_id;
+
+       dev->group = uml_vfio_open_group(group_id);
+       if (dev->group < 0)
+               return dev->group;
+
+       ret = uml_vfio_user_setup_device(&dev->udev, dev->group, dev->name);
+       if (ret)
+               goto err_group;
+
+       ret = uml_vfio_read_msix_table(dev);
+       if (ret)
+               goto err_udev;
+
+       dev->intr_ctx = kmalloc_array(dev->udev.irq_count,
+                                     sizeof(*dev->intr_ctx), GFP_KERNEL);
+       if (!dev->intr_ctx) {
+               ret = -ENOMEM;
+               goto err_msix;
+       }
+
+       /* Every vector starts out inactive. */
+       for (vec = 0; vec < dev->udev.irq_count; vec++) {
+               dev->intr_ctx[vec].dev = dev;
+               dev->intr_ctx[vec].irq = -1;
+       }
+
+       dev->pdev.ops = &uml_vfio_um_pci_ops;
+
+       ret = um_pci_device_register(&dev->pdev);
+       if (ret)
+               goto err_intr_ctx;
+
+       return 0;
+
+err_intr_ctx:
+       kfree(dev->intr_ctx);
+err_msix:
+       kfree(dev->msix_data);
+err_udev:
+       uml_vfio_user_teardown_device(&dev->udev);
+err_group:
+       uml_vfio_release_group(dev->group);
+       return ret;
+}
+
+/*
+ * Tear down a device that uml_vfio_open_device() brought up and free
+ * it, including dev->name.
+ */
+static void uml_vfio_release_device(struct uml_vfio_device *dev)
+{
+       int vec = 0;
+
+       while (vec < dev->udev.irq_count)
+               uml_vfio_deactivate_irq(dev, vec++);
+
+       /* Push the now empty vector set to VFIO before unregistering. */
+       uml_vfio_user_update_irqs(&dev->udev);
+       um_pci_device_unregister(&dev->pdev);
+
+       kfree(dev->intr_ctx);
+       kfree(dev->msix_data);
+
+       uml_vfio_user_teardown_device(&dev->udev);
+       uml_vfio_release_group(dev->group);
+
+       kfree(dev->name);
+       kfree(dev);
+}
+
+/*
+ * Setter of the "device" parameter: record @device in the first free
+ * slot of uml_vfio_devices[]. The device is not opened here.
+ *
+ * Returns 0, -ENOSPC if all MAX_DEVICES slots are taken, or -ENOMEM.
+ */
+static int uml_vfio_cmdline_set(const char *device,
+                               const struct kernel_param *kp)
+{
+       struct uml_vfio_device *vdev;
+       int slot;
+
+       for (slot = 0; slot < MAX_DEVICES; slot++) {
+               if (!uml_vfio_devices[slot])
+                       break;
+       }
+
+       if (slot == MAX_DEVICES)
+               return -ENOSPC;
+
+       vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+       if (!vdev)
+               return -ENOMEM;
+
+       vdev->name = kstrdup(device, GFP_KERNEL);
+       if (!vdev->name) {
+               kfree(vdev);
+               return -ENOMEM;
+       }
+
+       uml_vfio_devices[slot] = vdev;
+       return 0;
+}
+
+/* Getter of the "device" parameter: writes nothing to @buffer and returns 0. */
+static int uml_vfio_cmdline_get(char *buffer, const struct kernel_param *kp)
+{
+       return 0;
+}
+
+/* Accessors behind the "device" parameter registered below. */
+static const struct kernel_param_ops uml_vfio_cmdline_param_ops = {
+       .set = uml_vfio_cmdline_set,
+       .get = uml_vfio_cmdline_get,
+};
+
+/* Register the "device" parameter (mode 0400) and its command line help text. */
+device_param_cb(device, &uml_vfio_cmdline_param_ops, NULL, 0400);
+__uml_help(uml_vfio_cmdline_param_ops,
+"vfio_uml.device=<domain:bus:slot.function>\n"
+"    Pass through a PCI device to UML via VFIO. Currently, only MSI-X\n"
+"    capable devices are supported, and it is assumed that drivers will\n"
+"    use MSI-X. This parameter can be specified multiple times to pass\n"
+"    through multiple PCI devices to UML.\n\n"
+);
+
+/*
+ * Open the VFIO container and bring up every device that was requested
+ * on the command line.
+ *
+ * A device that fails to open has already been torn down by
+ * uml_vfio_open_device(), so it is freed and dropped from
+ * uml_vfio_devices[] here. This keeps uml_vfio_exit() from releasing it
+ * a second time. The surviving entries are kept packed at the start of
+ * the array, because uml_vfio_exit() stops at the first empty slot.
+ *
+ * Returns 0, or a negative error code if the container cannot be
+ * opened.
+ */
+static int __init uml_vfio_init(void)
+{
+       struct uml_vfio_device *dev;
+       int err, i, opened = 0;
+
+       sigio_broken();
+
+       err = uml_vfio_open_container();
+       if (err)
+               return err;
+
+       for (i = 0; i < MAX_DEVICES; i++) {
+               dev = uml_vfio_devices[i];
+               if (!dev)
+                       break;
+
+               /* Re-inserted below, at the packed position, on success. */
+               uml_vfio_devices[i] = NULL;
+
+               err = uml_vfio_open_device(dev);
+               if (err) {
+                       printk(KERN_ERR "uml_vfio open device (%s) failed, error %d\n",
+                              dev->name, err);
+                       kfree(dev->name);
+                       kfree(dev);
+                       continue;
+               }
+
+               uml_vfio_devices[opened++] = dev;
+       }
+
+       return 0;
+}
+late_initcall(uml_vfio_init);
+
+/* Release every recorded device, then close the container. */
+static void __exit uml_vfio_exit(void)
+{
+       int i;
+
+       /* The first empty slot ends the list. */
+       for (i = 0; i < MAX_DEVICES && uml_vfio_devices[i]; i++)
+               uml_vfio_release_device(uml_vfio_devices[i]);
+
+       uml_vfio_release_container();
+}
+module_exit(uml_vfio_exit);
diff --git a/arch/um/drivers/vfio_user.c b/arch/um/drivers/vfio_user.c
new file mode 100644
index 000000000000..0a97e367fc59
--- /dev/null
+++ b/arch/um/drivers/vfio_user.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2025 Ant Group
+ * Author: Tiwei Bie <tiwei....@antgroup.com>
+ */
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include <linux/limits.h>
+#include <linux/vfio.h>
+#include <linux/pci_regs.h>
+#include <as-layout.h>
+#include <um_malloc.h>
+
+#include "vfio_user.h"
+
+/*
+ * Open /dev/vfio/vfio and check that it speaks the expected API version
+ * and supports the type1 IOMMU.
+ *
+ * Returns the container fd, -errno of the failing call, or -EINVAL if
+ * the version or extension check does not pass.
+ */
+int uml_vfio_user_open_container(void)
+{
+       int container, ret;
+
+       container = open("/dev/vfio/vfio", O_RDWR);
+       if (container < 0)
+               return -errno;
+
+       ret = ioctl(container, VFIO_GET_API_VERSION);
+       if (ret < 0) {
+               /* Sample errno before close() gets a chance to change it. */
+               ret = -errno;
+               close(container);
+               return ret;
+       }
+       if (ret != VFIO_API_VERSION) {
+               close(container);
+               return -EINVAL;
+       }
+
+       ret = ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU);
+       if (ret < 0) {
+               ret = -errno;
+               close(container);
+               return ret;
+       }
+       if (ret == 0) {
+               close(container);
+               return -EINVAL;
+       }
+
+       return container;
+}
+
+/*
+ * Select the type1 IOMMU for @container and install one read/write DMA
+ * mapping that starts at uml_reserved. The IOVA is the distance of
+ * uml_reserved from uml_physmem and the mapping covers the rest of
+ * physmem_size.
+ *
+ * Returns 0 or -errno.
+ */
+int uml_vfio_user_setup_iommu(int container)
+{
+       unsigned long iova_base = uml_reserved - uml_physmem;
+       struct vfio_iommu_type1_dma_map map;
+
+       memset(&map, 0, sizeof(map));
+       map.argsz = sizeof(map);
+       map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+       map.vaddr = uml_reserved;
+       map.iova = iova_base;
+       map.size = physmem_size - iova_base;
+
+       if (ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU) < 0)
+               return -errno;
+
+       if (ioctl(container, VFIO_IOMMU_MAP_DMA, &map) < 0)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Look up the IOMMU group of PCI device @device by resolving its
+ * /sys/bus/pci/devices/<device>/iommu_group symlink; the last path
+ * component of the link target is the group id.
+ *
+ * @device is a caller-supplied string, so the sysfs path is built with
+ * a bounded format and an over-long name is rejected rather than
+ * written past the end of the buffer.
+ *
+ * Returns the group id, -ENAMETOOLONG if the path does not fit,
+ * -ENOMEM, -errno from readlink(), or -EINVAL if the link target does
+ * not end in a decimal number.
+ */
+int uml_vfio_user_get_group_id(const char *device)
+{
+       char *path, *buf, *end;
+       const char *name;
+       int r;
+
+       path = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
+       if (!path)
+               return -ENOMEM;
+
+       r = snprintf(path, PATH_MAX, "/sys/bus/pci/devices/%s/iommu_group",
+                    device);
+       if (r < 0 || r >= PATH_MAX) {
+               r = -ENAMETOOLONG;
+               goto free_path;
+       }
+
+       buf = uml_kmalloc(PATH_MAX + 1, UM_GFP_KERNEL);
+       if (!buf) {
+               r = -ENOMEM;
+               goto free_path;
+       }
+
+       /* readlink() does not terminate the string; buf has room for the NUL. */
+       r = readlink(path, buf, PATH_MAX);
+       if (r < 0) {
+               r = -errno;
+               goto free_buf;
+       }
+       buf[r] = '\0';
+
+       name = basename(buf);
+
+       r = strtoul(name, &end, 10);
+       if (*end != '\0' || end == name) {
+               r = -EINVAL;
+               goto free_buf;
+       }
+
+free_buf:
+       kfree(buf);
+free_path:
+       kfree(path);
+       return r;
+}
+
+/*
+ * Open /dev/vfio/<group_id> read/write.
+ * Returns the group fd, -ENOMEM or -errno.
+ */
+int uml_vfio_user_open_group(int group_id)
+{
+       char *node;
+       int group;
+
+       node = uml_kmalloc(PATH_MAX, UM_GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       sprintf(node, "/dev/vfio/%d", group_id);
+
+       group = open(node, O_RDWR);
+       if (group < 0)
+               group = -errno;
+
+       kfree(node);
+       return group;
+}
+
+/* Attach @group to @container. Returns 0 or -errno. */
+int uml_vfio_user_set_container(int container, int group)
+{
+       int rc = ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
+
+       return rc < 0 ? -errno : 0;
+}
+
+/* Detach @group from its container. Returns 0 or -errno. */
+int uml_vfio_user_unset_container(int container, int group)
+{
+       int rc = ioctl(group, VFIO_GROUP_UNSET_CONTAINER, &container);
+
+       return rc < 0 ? -errno : 0;
+}
+
+/*
+ * Hand @count eventfds from @irqfd to VFIO as the triggers of the MSI-X
+ * vectors starting at @start.
+ *
+ * Returns 0, -ENOMEM or -errno.
+ */
+static int vfio_set_irqs(int device, int start, int count, int *irqfd)
+{
+       size_t fds_len = sizeof(*irqfd) * count;
+       int argsz = sizeof(struct vfio_irq_set) + fds_len;
+       struct vfio_irq_set *req;
+       int ret;
+
+       req = uml_kmalloc(argsz, UM_GFP_KERNEL);
+       if (!req)
+               return -ENOMEM;
+
+       req->argsz = argsz;
+       req->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+       req->index = VFIO_PCI_MSIX_IRQ_INDEX;
+       req->start = start;
+       req->count = count;
+       memcpy(req->data, irqfd, fds_len);
+
+       ret = ioctl(device, VFIO_DEVICE_SET_IRQS, req) < 0 ? -errno : 0;
+
+       kfree(req);
+       return ret;
+}
+
+/*
+ * Get the device fd for @device from @group and fill in @dev: the size
+ * and offset of each tracked region, the number of MSI-X vectors and an
+ * eventfd array with every entry set to -1, which is then handed to
+ * VFIO.
+ *
+ * On failure everything acquired here is released again.
+ * Returns 0, -ENOMEM or -errno.
+ */
+int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev,
+                              int group, const char *device)
+{
+       const int max_regions = VFIO_PCI_CONFIG_REGION_INDEX + 1;
+       struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+       struct vfio_irq_info msix_info = {
+               .argsz = sizeof(msix_info),
+               /* Only MSI-X is supported currently. */
+               .index = VFIO_PCI_MSIX_IRQ_INDEX,
+       };
+       int ret, idx;
+
+       dev->device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, device);
+       if (dev->device < 0)
+               return -errno;
+
+       if (ioctl(dev->device, VFIO_DEVICE_GET_INFO, &dev_info) < 0) {
+               ret = -errno;
+               goto err_close;
+       }
+
+       /* Regions past the config space region are not tracked. */
+       dev->num_regions = dev_info.num_regions;
+       if (dev->num_regions > max_regions)
+               dev->num_regions = max_regions;
+
+       dev->region = uml_kmalloc(sizeof(*dev->region) * dev->num_regions,
+                                 UM_GFP_KERNEL);
+       if (!dev->region) {
+               ret = -ENOMEM;
+               goto err_close;
+       }
+
+       for (idx = 0; idx < dev->num_regions; idx++) {
+               struct vfio_region_info info = {
+                       .argsz = sizeof(info),
+                       .index = idx,
+               };
+
+               if (ioctl(dev->device, VFIO_DEVICE_GET_REGION_INFO, &info) < 0) {
+                       ret = -errno;
+                       goto err_region;
+               }
+               dev->region[idx].size = info.size;
+               dev->region[idx].offset = info.offset;
+       }
+
+       if (ioctl(dev->device, VFIO_DEVICE_GET_IRQ_INFO, &msix_info) < 0) {
+               ret = -errno;
+               goto err_region;
+       }
+
+       dev->irq_count = msix_info.count;
+
+       dev->irqfd = uml_kmalloc(sizeof(int) * dev->irq_count, UM_GFP_KERNEL);
+       if (!dev->irqfd) {
+               ret = -ENOMEM;
+               goto err_region;
+       }
+
+       /* No vector has an eventfd yet. */
+       for (idx = 0; idx < dev->irq_count; idx++)
+               dev->irqfd[idx] = -1;
+
+       ret = vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
+       if (ret)
+               goto err_irqfd;
+
+       return 0;
+
+err_irqfd:
+       kfree(dev->irqfd);
+err_region:
+       kfree(dev->region);
+err_close:
+       close(dev->device);
+       return ret;
+}
+
+/* Free the region and eventfd arrays of @dev and close its device fd. */
+void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev)
+{
+       kfree(dev->region);
+       kfree(dev->irqfd);
+       close(dev->device);
+}
+
+/*
+ * Create a non-blocking, close-on-exec eventfd for vector @index and
+ * store it in dev->irqfd[]. VFIO is not told about it here.
+ *
+ * Returns the new fd or -errno.
+ */
+int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index)
+{
+       int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+
+       if (efd < 0)
+               return -errno;
+
+       dev->irqfd[index] = efd;
+       return efd;
+}
+
+/* Close the eventfd of vector @index and mark the slot unused (-1). */
+void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index)
+{
+       int32_t *slot = &dev->irqfd[index];
+
+       close(*slot);
+       *slot = -1;
+}
+
+/* Hand the whole dev->irqfd[] array to VFIO again. Returns 0 or a negative error code. */
+int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev)
+{
+       return vfio_set_irqs(dev->device, 0, dev->irq_count, dev->irqfd);
+}
+
+/*
+ * Read @size bytes at @offset of region @index into @buf.
+ *
+ * The range check is written so it cannot wrap: callers pass int sizes
+ * that are widened to uint64_t, and "offset + size" could overflow and
+ * slip past a naive comparison.
+ *
+ * Returns 0, -EINVAL for a bad region or range, or -errno from pread().
+ */
+static int vfio_region_read(struct uml_vfio_user_device *dev,
+                           unsigned int index, uint64_t offset,
+                           void *buf, uint64_t size)
+{
+       uint64_t region_size;
+
+       if (index >= dev->num_regions)
+               return -EINVAL;
+
+       region_size = dev->region[index].size;
+       if (size > region_size || offset > region_size - size)
+               return -EINVAL;
+
+       if (pread(dev->device, buf, size,
+                 dev->region[index].offset + offset) < 0)
+               return -errno;
+
+       return 0;
+}
+
+/*
+ * Write @size bytes from @buf at @offset of region @index.
+ *
+ * The range check is written so it cannot wrap: callers pass int sizes
+ * that are widened to uint64_t, and "offset + size" could overflow and
+ * slip past a naive comparison.
+ *
+ * Returns 0, -EINVAL for a bad region or range, or -errno from pwrite().
+ */
+static int vfio_region_write(struct uml_vfio_user_device *dev,
+                            unsigned int index, uint64_t offset,
+                            const void *buf, uint64_t size)
+{
+       uint64_t region_size;
+
+       if (index >= dev->num_regions)
+               return -EINVAL;
+
+       region_size = dev->region[index].size;
+       if (size > region_size || offset > region_size - size)
+               return -EINVAL;
+
+       if (pwrite(dev->device, buf, size,
+                  dev->region[index].offset + offset) < 0)
+               return -errno;
+
+       return 0;
+}
+
+/* Read @size bytes of PCI config space at @offset into @buf. Returns 0 or a negative error code. */
+int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev,
+                               unsigned int offset, void *buf, int size)
+{
+       return vfio_region_read(dev, VFIO_PCI_CONFIG_REGION_INDEX,
+                               offset, buf, size);
+}
+
+/* Write @size bytes from @buf to PCI config space at @offset. Returns 0 or a negative error code. */
+int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev,
+                                unsigned int offset, const void *buf, int size)
+{
+       return vfio_region_write(dev, VFIO_PCI_CONFIG_REGION_INDEX,
+                                offset, buf, size);
+}
+
+/* Read @size bytes at @offset of BAR @bar into @buf; @bar is used directly as the region index. */
+int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar,
+                          unsigned int offset, void *buf, int size)
+{
+       return vfio_region_read(dev, bar, offset, buf, size);
+}
+
+/* Write @size bytes from @buf at @offset of BAR @bar; @bar is used directly as the region index. */
+int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar,
+                           unsigned int offset, const void *buf, int size)
+{
+       return vfio_region_write(dev, bar, offset, buf, size);
+}
diff --git a/arch/um/drivers/vfio_user.h b/arch/um/drivers/vfio_user.h
new file mode 100644
index 000000000000..75535e05059b
--- /dev/null
+++ b/arch/um/drivers/vfio_user.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_VFIO_USER_H
+#define __UM_VFIO_USER_H
+
+/* Userspace-side handle of one VFIO device. */
+struct uml_vfio_user_device {
+       int device;             /* VFIO device fd */
+
+       /* Size and file offset of each tracked region of the device fd. */
+       struct {
+               uint64_t size;
+               uint64_t offset;
+       } *region;
+       int num_regions;        /* number of entries in @region */
+
+       int32_t *irqfd;         /* one eventfd per vector, -1 when unused */
+       int irq_count;          /* number of entries in @irqfd */
+};
+
+/* Container handling. */
+int uml_vfio_user_open_container(void);
+int uml_vfio_user_setup_iommu(int container);
+
+/* Group handling. */
+int uml_vfio_user_get_group_id(const char *device);
+int uml_vfio_user_open_group(int group_id);
+int uml_vfio_user_set_container(int container, int group);
+int uml_vfio_user_unset_container(int container, int group);
+
+/* Device setup and teardown. */
+int uml_vfio_user_setup_device(struct uml_vfio_user_device *dev,
+                              int group, const char *device);
+void uml_vfio_user_teardown_device(struct uml_vfio_user_device *dev);
+
+/* Per-vector eventfd handling. */
+int uml_vfio_user_activate_irq(struct uml_vfio_user_device *dev, int index);
+void uml_vfio_user_deactivate_irq(struct uml_vfio_user_device *dev, int index);
+int uml_vfio_user_update_irqs(struct uml_vfio_user_device *dev);
+
+/* Config space access. */
+int uml_vfio_user_cfgspace_read(struct uml_vfio_user_device *dev,
+                               unsigned int offset, void *buf, int size);
+int uml_vfio_user_cfgspace_write(struct uml_vfio_user_device *dev,
+                                unsigned int offset, const void *buf, int size);
+
+/* BAR access. */
+int uml_vfio_user_bar_read(struct uml_vfio_user_device *dev, int bar,
+                          unsigned int offset, void *buf, int size);
+int uml_vfio_user_bar_write(struct uml_vfio_user_device *dev, int bar,
+                           unsigned int offset, const void *buf, int size);
+
+#endif /* __UM_VFIO_USER_H */
-- 
2.34.1


Reply via email to