On 09/30/2015 03:28 PM, Stephen Hemminger wrote:
> This driver allows using PCI device with Message Signalled Interrupt
> from userspace. The API is similar to the igb_uio driver used by the DPDK.
> Via ioctl it provides a mechanism to map MSI-X interrupts into event
> file descriptors similar to VFIO.
>
> VFIO is a better choice if IOMMU is available, but often userspace drivers
> have to work in environments where IOMMU support (real or emulated) is
> not available.  All UIO drivers that support DMA are not secure against
> rogue userspace applications programming DMA hardware to access
> private memory; this driver is no less secure than existing code.
>
> Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
> ---
>   drivers/uio/Kconfig          |   9 ++
>   drivers/uio/Makefile         |   1 +
>   drivers/uio/uio_msi.c        | 378 
> +++++++++++++++++++++++++++++++++++++++++++
>   include/uapi/linux/Kbuild    |   1 +
>   include/uapi/linux/uio_msi.h |  22 +++
>   5 files changed, 411 insertions(+)
>   create mode 100644 drivers/uio/uio_msi.c
>   create mode 100644 include/uapi/linux/uio_msi.h
>
> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> index 52c98ce..04adfa0 100644
> --- a/drivers/uio/Kconfig
> +++ b/drivers/uio/Kconfig
> @@ -93,6 +93,15 @@ config UIO_PCI_GENERIC
>         primarily, for virtualization scenarios.
>         If you compile this as a module, it will be called uio_pci_generic.
>
> +config UIO_PCI_MSI
> +       tristate "Generic driver supporting MSI-x on PCI Express cards"
> +     depends on PCI
> +     help
> +       Generic driver that provides Message Signalled IRQ events
> +       similar to VFIO. If IOMMMU is available please use VFIO
> +       instead since it provides more security.
> +       If you compile this as a module, it will be called uio_msi.
> +
>   config UIO_NETX
>       tristate "Hilscher NetX Card driver"
>       depends on PCI

Should you maybe instead depend on CONFIG_PCI_MSI.  Without MSI this is 
essentially just uio_pci_generic with a bit more greedy mapping setup.

> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
> index 8560dad..62fc44b 100644
> --- a/drivers/uio/Makefile
> +++ b/drivers/uio/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_UIO_NETX)        += uio_netx.o
>   obj-$(CONFIG_UIO_PRUSS)         += uio_pruss.o
>   obj-$(CONFIG_UIO_MF624)         += uio_mf624.o
>   obj-$(CONFIG_UIO_FSL_ELBC_GPCM)     += uio_fsl_elbc_gpcm.o
> +obj-$(CONFIG_UIO_PCI_MSI)    += uio_msi.o
> diff --git a/drivers/uio/uio_msi.c b/drivers/uio/uio_msi.c
> new file mode 100644
> index 0000000..802b5c4
> --- /dev/null
> +++ b/drivers/uio/uio_msi.c
> @@ -0,0 +1,378 @@
> +/*-
> + *
> + * Copyright (c) 2015 by Brocade Communications Systems, Inc.
> + * Author: Stephen Hemminger <stephen at networkplumber.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 only.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/device.h>
> +#include <linux/interrupt.h>
> +#include <linux/eventfd.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/uio_driver.h>
> +#include <linux/msi.h>
> +#include <linux/uio_msi.h>
> +
> +#define DRIVER_VERSION               "0.1.1"
> +#define MAX_MSIX_VECTORS     64
> +
> +/* MSI-X vector information */
> +struct uio_msi_pci_dev {
> +     struct uio_info info;           /* UIO driver info */
> +     struct pci_dev *pdev;           /* PCI device */
> +     struct mutex    mutex;          /* open/release/ioctl mutex */
> +     int             ref_cnt;        /* references to device */
> +     unsigned int    max_vectors;    /* MSI-X slots available */
> +     struct msix_entry *msix;        /* MSI-X vector table */
> +     struct uio_msi_irq_ctx {
> +             struct eventfd_ctx *trigger; /* vector to eventfd */
> +             char *name;             /* name in /proc/interrupts */
> +     } *ctx;
> +};
> +

I would move the definition of uio_msi_irq_ctx out of uio_msi_pci_dev. 
It would help to make this a bit more readable.

> +static irqreturn_t uio_intx_irqhandler(int irq, void *arg)
> +{
> +     struct uio_msi_pci_dev *udev = arg;
> +
> +     if (pci_check_and_mask_intx(udev->pdev)) {
> +             eventfd_signal(udev->ctx->trigger, 1);
> +             return IRQ_HANDLED;
> +     }
> +
> +     return IRQ_NONE;
> +}
> +

I would really prefer to see the intx handling dropped since there are 
already 2 different UIO drivers setup for handling INTx style 
interrupts.  Lets focus on the parts from the last decade and drop 
support for INTx now in favor of MSI-X and maybe MSI.  If we _REALLY_ 
need it we can always come back later and add it.

> +static irqreturn_t uio_msi_irqhandler(int irq, void *arg)
> +{
> +     struct eventfd_ctx *trigger = arg;
> +
> +     eventfd_signal(trigger, 1);
> +     return IRQ_HANDLED;
> +}
> +
> +/* set the mapping between vector # and existing eventfd. */
> +static int set_irq_eventfd(struct uio_msi_pci_dev *udev, u32 vec, int fd)
> +{
> +     struct eventfd_ctx *trigger;
> +     int irq, err;
> +
> +     if (vec >= udev->max_vectors) {
> +             dev_notice(&udev->pdev->dev, "vec %u >= num_vec %u\n",
> +                        vec, udev->max_vectors);
> +             return -ERANGE;
> +     }
> +
> +     irq = udev->msix[vec].vector;
> +     trigger = udev->ctx[vec].trigger;
> +     if (trigger) {
> +             /* Clearup existing irq mapping */

Minor spelling issue here, "Clear up" should be 2 words, "Cleanup" can 
be one.

> +             free_irq(irq, trigger);
> +             eventfd_ctx_put(trigger);
> +             udev->ctx[vec].trigger = NULL;
> +     }
> +
> +     /* Passing -1 is used to disable interrupt */
> +     if (fd < 0)
> +             return 0;
> +
> +     trigger = eventfd_ctx_fdget(fd);
> +     if (IS_ERR(trigger)) {
> +             err = PTR_ERR(trigger);
> +             dev_notice(&udev->pdev->dev,
> +                        "eventfd ctx get failed: %d\n", err);
> +             return err;
> +     }
> +
> +     if (udev->msix)
> +             err = request_irq(irq, uio_msi_irqhandler, 0,
> +                               udev->ctx[vec].name, trigger);
> +     else
> +             err = request_irq(irq, uio_intx_irqhandler, IRQF_SHARED,
> +                               udev->ctx[vec].name, udev);
> +
> +     if (err) {
> +             dev_notice(&udev->pdev->dev,
> +                        "request irq failed: %d\n", err);
> +             eventfd_ctx_put(trigger);
> +             return err;
> +     }
> +
> +     udev->ctx[vec].trigger = trigger;
> +     return 0;
> +}
> +
> +static int
> +uio_msi_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg)
> +{
> +     struct uio_msi_pci_dev *udev
> +             = container_of(info, struct uio_msi_pci_dev, info);
> +     struct uio_msi_irq_set hdr;
> +     int err;
> +
> +     switch (cmd) {
> +     case UIO_MSI_IRQ_SET:
> +             if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
> +                     return -EFAULT;
> +
> +             mutex_lock(&udev->mutex);
> +             err = set_irq_eventfd(udev, hdr.vec, hdr.fd);
> +             mutex_unlock(&udev->mutex);
> +             break;
> +     default:
> +             err = -EOPNOTSUPP;
> +     }
> +     return err;
> +}
> +
> +/* Opening the UIO device for first time enables MSI-X */
> +static int
> +uio_msi_open(struct uio_info *info, struct inode *inode)
> +{
> +     struct uio_msi_pci_dev *udev
> +             = container_of(info, struct uio_msi_pci_dev, info);
> +     int err = 0;
> +
> +     mutex_lock(&udev->mutex);
> +     if (udev->ref_cnt++ == 0) {
> +             if (udev->msix)
> +                     err = pci_enable_msix(udev->pdev, udev->msix,
> +                                           udev->max_vectors);
> +     }
> +     mutex_unlock(&udev->mutex);
> +
> +     return err;
> +}
> +

I agree with some other reviewers.  Why call pci_enable_msix in open? 
It seems like it would make much more sense to do this on probe, and 
then disable MSI-X on free.  I can only assume you are trying to do it 
to save on resources but the fact is this is a driver you have to 
explicitly force onto a device so you would probably be safe to assume 
that they plan to use it in the near future.

> +/* Last close of the UIO device releases/disables all IRQ's */
> +static int
> +uio_msi_release(struct uio_info *info, struct inode *inode)
> +{
> +     struct uio_msi_pci_dev *udev
> +             = container_of(info, struct uio_msi_pci_dev, info);
> +     int i;
> +
> +     mutex_lock(&udev->mutex);
> +     if (--udev->ref_cnt == 0) {
> +             for (i = 0; i < udev->max_vectors; i++) {
> +                     int irq = udev->msix[i].vector;
> +                     struct eventfd_ctx *trigger = udev->ctx[i].trigger;
> +
> +                     if (!trigger)
> +                             continue;
> +
> +                     free_irq(irq, trigger);
> +                     eventfd_ctx_put(trigger);
> +                     udev->ctx[i].trigger = NULL;
> +             }
> +
> +             if (udev->msix)
> +                     pci_disable_msix(udev->pdev);
> +     }
> +     mutex_unlock(&udev->mutex);
> +
> +     return 0;
> +}
> +
> +/* Unmap previously ioremap'd resources */
> +static void
> +release_iomaps(struct uio_mem *mem)
> +{
> +     int i;
> +
> +     for (i = 0; i < MAX_UIO_MAPS; i++, mem++) {
> +             if (mem->internal_addr)
> +                     iounmap(mem->internal_addr);
> +     }
> +}
> +
> +static int
> +setup_maps(struct pci_dev *pdev, struct uio_info *info)
> +{
> +     int i, m = 0, p = 0, err;
> +     static const char * const bar_names[] = {
> +             "BAR0", "BAR1", "BAR2", "BAR3", "BAR4", "BAR5",
> +     };
> +
> +     for (i = 0; i < ARRAY_SIZE(bar_names); i++) {
> +             unsigned long start = pci_resource_start(pdev, i);
> +             unsigned long flags = pci_resource_flags(pdev, i);
> +             unsigned long len = pci_resource_len(pdev, i);
> +
> +             if (start == 0 || len == 0)
> +                     continue;
> +
> +             if (flags & IORESOURCE_MEM) {
> +                     void __iomem *addr;
> +
> +                     if (m >= MAX_UIO_MAPS)
> +                             continue;
> +
> +                     addr = ioremap(start, len);
> +                     if (addr == NULL) {
> +                             err = -EINVAL;
> +                             goto fail;
> +                     }
> +
> +                     info->mem[m].name = bar_names[i];
> +                     info->mem[m].addr = start;
> +                     info->mem[m].internal_addr = addr;
> +                     info->mem[m].size = len;
> +                     info->mem[m].memtype = UIO_MEM_PHYS;
> +                     ++m;
> +             } else if (flags & IORESOURCE_IO) {
> +                     if (p >= MAX_UIO_PORT_REGIONS)
> +                             continue;
> +
> +                     info->port[p].name = bar_names[i];
> +                     info->port[p].start = start;
> +                     info->port[p].size = len;
> +                     info->port[p].porttype = UIO_PORT_X86;
> +                     ++p;
> +             }
> +     }
> +
> +     return 0;
> + fail:
> +     for (i = 0; i < m; i++)
> +             iounmap(info->mem[i].internal_addr);
> +     return err;
> +}
> +

Do you really need to map IORESOURCE bars?  Most drivers I can think of 
don't use IO BARs anymore.  Maybe we could look at just dropping the 
code and adding it back later if we have a use case that absolutely 
needs it.

Also how many devices actually need resources beyond BAR 0?  I'm just 
curious as I know BAR 2 on many of the Intel devices is the register 
space related to MSI-X so now we have both the PCIe subsystem and user 
space with access to this region.

> +static int uio_msi_probe(struct pci_dev *pdev, const struct pci_device_id 
> *id)
> +{
> +     struct uio_msi_pci_dev *udev;
> +     int i, err, vectors;
> +
> +     udev = kzalloc(sizeof(struct uio_msi_pci_dev), GFP_KERNEL);
> +     if (!udev)
> +             return -ENOMEM;
> +
> +     err = pci_enable_device(pdev);
> +     if (err != 0) {
> +             dev_err(&pdev->dev, "cannot enable PCI device\n");
> +             goto fail_free;
> +     }
> +
> +     err = pci_request_regions(pdev, "uio_msi");
> +     if (err != 0) {
> +             dev_err(&pdev->dev, "Cannot request regions\n");
> +             goto fail_disable;
> +     }
> +
> +     pci_set_master(pdev);
> +
> +     /* remap resources */
> +     err = setup_maps(pdev, &udev->info);
> +     if (err)
> +             goto fail_release_iomem;
> +
> +     /* fill uio infos */
> +     udev->info.name = "uio_msi";
> +     udev->info.version = DRIVER_VERSION;
> +     udev->info.priv = udev;
> +     udev->pdev = pdev;
> +     udev->info.ioctl = uio_msi_ioctl;
> +     udev->info.open = uio_msi_open;
> +     udev->info.release = uio_msi_release;
> +     udev->info.irq = UIO_IRQ_CUSTOM;
> +     mutex_init(&udev->mutex);
> +
> +     vectors = pci_msix_vec_count(pdev);
> +     if (vectors > 0) {
> +             udev->max_vectors = min_t(u16, vectors, MAX_MSIX_VECTORS);
> +             dev_info(&pdev->dev, "using up to %u MSI-X vectors\n",
> +                     udev->max_vectors);
> +
> +             err = -ENOMEM;
> +             udev->msix = kcalloc(udev->max_vectors,
> +                                  sizeof(struct msix_entry), GFP_KERNEL);
> +             if (!udev->msix)
> +                     goto fail_release_iomem;
> +     } else if (!pci_intx_mask_supported(pdev)) {
> +             dev_err(&pdev->dev,
> +                     "device does not support MSI-X or INTX\n");
> +             err = -EINVAL;
> +             goto fail_release_iomem;
> +     } else {
> +             dev_notice(&pdev->dev, "using INTX\n");
> +             udev->info.irq_flags = IRQF_SHARED;
> +             udev->max_vectors = 1;
> +     }
> +
> +     udev->ctx = kcalloc(udev->max_vectors,
> +                         sizeof(struct uio_msi_irq_ctx), GFP_KERNEL);
> +     if (!udev->ctx)
> +             goto fail_free_msix;
> +
> +     for (i = 0; i < udev->max_vectors; i++) {
> +             udev->msix[i].entry = i;
> +
> +             udev->ctx[i].name = kasprintf(GFP_KERNEL,
> +                                           KBUILD_MODNAME "[%d](%s)",
> +                                           i, pci_name(pdev));
> +             if (!udev->ctx[i].name)
> +                     goto fail_free_ctx;
> +     }
> +
> +     /* register uio driver */
> +     err = uio_register_device(&pdev->dev, &udev->info);
> +     if (err != 0)
> +             goto fail_free_ctx;
> +
> +     pci_set_drvdata(pdev, udev);
> +     return 0;
> +
> +fail_free_ctx:
> +     for (i = 0; i < udev->max_vectors; i++)
> +             kfree(udev->ctx[i].name);
> +     kfree(udev->ctx);
> +fail_free_msix:
> +     kfree(udev->msix);
> +fail_release_iomem:
> +     release_iomaps(udev->info.mem);
> +     pci_release_regions(pdev);
> +fail_disable:
> +     pci_disable_device(pdev);
> +fail_free:
> +     kfree(udev);
> +
> +     pr_notice("%s ret %d\n", __func__, err);
> +     return err;
> +}
> +
> +static void uio_msi_remove(struct pci_dev *pdev)
> +{
> +     struct uio_info *info = pci_get_drvdata(pdev);
> +     struct uio_msi_pci_dev *udev
> +             = container_of(info, struct uio_msi_pci_dev, info);
> +     int i;
> +
> +     uio_unregister_device(info);
> +     release_iomaps(info->mem);
> +
> +     pci_release_regions(pdev);
> +     for (i = 0; i < udev->max_vectors; i++)
> +             kfree(udev->ctx[i].name);
> +     kfree(udev->ctx);
> +     kfree(udev->msix);
> +     pci_disable_device(pdev);
> +
> +     pci_set_drvdata(pdev, NULL);
> +     kfree(udev);
> +}
> +
> +static struct pci_driver uio_msi_pci_driver = {
> +     .name = "uio_msi",
> +     .probe = uio_msi_probe,
> +     .remove = uio_msi_remove,
> +};
> +
> +module_pci_driver(uio_msi_pci_driver);
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Stephen Hemminger <stephen at networkplumber.org>");
> +MODULE_DESCRIPTION("UIO driver for MSI PCI devices");
> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
> index f7b2db4..d9497691 100644
> --- a/include/uapi/linux/Kbuild
> +++ b/include/uapi/linux/Kbuild
> @@ -411,6 +411,7 @@ header-y += udp.h
>   header-y += uhid.h
>   header-y += uinput.h
>   header-y += uio.h
> +header-y += uio_msi.h
>   header-y += ultrasound.h
>   header-y += un.h
>   header-y += unistd.h
> diff --git a/include/uapi/linux/uio_msi.h b/include/uapi/linux/uio_msi.h
> new file mode 100644
> index 0000000..297de00
> --- /dev/null
> +++ b/include/uapi/linux/uio_msi.h
> @@ -0,0 +1,22 @@
> +/*
> + * UIO_MSI API definition
> + *
> + * Copyright (c) 2015 by Brocade Communications Systems, Inc.
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef _UIO_PCI_MSI_H
> +#define _UIO_PCI_MSI_H
> +
> +struct uio_msi_irq_set {
> +     u32 vec;
> +     int fd;
> +};
> +
> +#define UIO_MSI_BASE 0x86
> +#define UIO_MSI_IRQ_SET      _IOW('I', UIO_MSI_BASE+1, struct 
> uio_msi_irq_set)
> +
> +#endif
>

Reply via email to