Minimal VFIO platform implementation supporting: - register space user mapping, - IRQ assignment based on eventfds handled on the QEMU side.
irqfd kernel acceleration comes in a subsequent patch. Signed-off-by: Kim Phillips <kim.phill...@linaro.org> Signed-off-by: Eric Auger <eric.au...@linaro.org> --- v3 -> v4: [Eric Auger] - merge of "vfio: Add initial IRQ support in platform device" to get a fully functional patch, although performance is limited. - removal of the unrealize function since, as I currently understand it, it is only used with the device hot-plug feature. v2 -> v3: [Eric Auger] - further factorization between PCI and platform (VFIORegion, VFIODevice). same level of functionality. <= v2: [Kim Phillips] - initial creation of the device supporting register space mapping --- hw/vfio/Makefile.objs | 1 + hw/vfio/platform.c | 528 ++++++++++++++++++++++++++++++++++++++++ include/hw/vfio/vfio-platform.h | 74 ++++++ 3 files changed, 603 insertions(+) create mode 100644 hw/vfio/platform.c create mode 100644 include/hw/vfio/vfio-platform.h diff --git a/hw/vfio/Makefile.objs b/hw/vfio/Makefile.objs index e31f30e..c5c76fe 100644 --- a/hw/vfio/Makefile.objs +++ b/hw/vfio/Makefile.objs @@ -1,4 +1,5 @@ ifeq ($(CONFIG_LINUX), y) obj-$(CONFIG_SOFTMMU) += common.o obj-$(CONFIG_PCI) += pci.o +obj-$(CONFIG_SOFTMMU) += platform.o endif diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c new file mode 100644 index 0000000..a5fc22b --- /dev/null +++ b/hw/vfio/platform.c @@ -0,0 +1,528 @@ +/* + * vfio based device assignment support - platform devices + * + * Copyright Linaro Limited, 2014 + * + * Authors: + * Kim Phillips <kim.phill...@linaro.org> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Based on vfio based PCI device assignment support: + * Copyright Red Hat, Inc. 
2012 + */ + +#include <linux/vfio.h> +#include <sys/ioctl.h> +#include <sys/mman.h> +#include "hw/pci/msi.h" +#include "hw/pci/msix.h" +#include "qemu/error-report.h" +#include "qemu/range.h" +#include "sysemu/sysemu.h" +#include "hw/vfio/vfio-platform.h" + +extern const MemoryRegionOps vfio_region_ops; +extern const MemoryListener vfio_memory_listener; +extern QLIST_HEAD(, VFIOGroup) group_list; +extern QLIST_HEAD(, VFIOAddressSpace) vfio_address_spaces; + +static void vfio_put_device(VFIOPlatformDevice *vdev) +{ + unsigned int i; + VFIODevice *vbasedev = &vdev->vbasedev; + + for (i = 0; i < vbasedev->num_regions; i++) { + g_free(vdev->regions[i]); + } + g_free(vdev->regions); + vfio_put_base_device(&vdev->vbasedev); +} + +/* + * It is mandatory to pass a VFIOPlatformDevice since VFIODevice + * is not a QOM Object and cannot be passed to memory region functions +*/ +static void vfio_map_region(VFIOPlatformDevice *vdev, int nr) +{ + VFIORegion *region = vdev->regions[nr]; + unsigned size = region->size; + char name[64]; + + snprintf(name, sizeof(name), "VFIO %s region %d", + vdev->vbasedev.name, nr); + + /* A "slow" read/write mapping underlies all regions */ + memory_region_init_io(®ion->mem, OBJECT(vdev), &vfio_region_ops, + region, name, size); + + strncat(name, " mmap", sizeof(name) - strlen(name) - 1); + + if (vfio_mmap_region(OBJECT(vdev), region, ®ion->mem, + ®ion->mmap_mem, ®ion->mmap, size, 0, name)) { + error_report("%s unsupported. 
Performance may be slow", name); + } +} + +static void print_regions(VFIOPlatformDevice *vdev) +{ + int i; + + DPRINTF("Device \"%s\" counts %d region(s):\n", + vdev->vbasedev.name, vdev->vbasedev.num_regions); + + for (i = 0; i < vdev->vbasedev.num_regions; i++) { + DPRINTF("- region %d flags = 0x%lx, size = 0x%lx, " + "fd= %d, offset = 0x%lx\n", + vdev->regions[i]->nr, + (unsigned long)vdev->regions[i]->flags, + (unsigned long)vdev->regions[i]->size, + vdev->regions[i]->fd, + (unsigned long)vdev->regions[i]->fd_offset); + } +} + +static int vfio_populate_regions(VFIODevice *vbasedev) +{ + struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) }; + int i, ret = errno; + VFIOPlatformDevice *vdev = + container_of(vbasedev, VFIOPlatformDevice, vbasedev); + + vdev->regions = g_malloc0(sizeof(VFIORegion *) * vbasedev->num_regions); + + for (i = 0; i < vbasedev->num_regions; i++) { + vdev->regions[i] = g_malloc0(sizeof(VFIORegion)); + reg_info.index = i; + ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, ®_info); + if (ret) { + error_report("vfio: Error getting region %d info: %m", i); + goto error; + } + + vdev->regions[i]->flags = reg_info.flags; + vdev->regions[i]->size = reg_info.size; + vdev->regions[i]->fd_offset = reg_info.offset; + vdev->regions[i]->fd = vbasedev->fd; + vdev->regions[i]->nr = i; + vdev->regions[i]->vbasedev = vbasedev; + } + print_regions(vdev); + return ret; +error: + vfio_put_device(vdev); + return ret; +} + +/* not implemented yet */ +static int vfio_platform_check_device(VFIODevice *vdev) +{ + return 0; +} + +/* not implemented yet */ +static bool vfio_platform_compute_needs_reset(VFIODevice *vdev) +{ +return false; +} + +/* not implemented yet */ +static int vfio_platform_hot_reset_multi(VFIODevice *vdev) +{ +return 0; +} + +/* + * eoi function is called on the first access to any MMIO region + * after an IRQ was triggered. It is assumed this access corresponds + * to the IRQ status register reset. 
+ * With such a mechanism, a single IRQ can be handled at a time since + * there is no way to know which IRQ was completed by the guest. + * (we would need additional details about the IRQ status register mask) + */ +static void vfio_platform_eoi(VFIODevice *vbasedev) +{ + VFIOINTp *intp; + VFIOPlatformDevice *vdev = + container_of(vbasedev, VFIOPlatformDevice, vbasedev); + bool eoi_done = false; + + QLIST_FOREACH(intp, &vdev->intp_list, next) { + if (intp->state == VFIO_IRQ_ACTIVE) { + if (eoi_done) { + error_report("several IRQ pending: " + "this case should not happen!\n"); + } + DPRINTF("EOI IRQ #%d fd=%d\n", + intp->pin, event_notifier_get_fd(&intp->interrupt)); + intp->state = VFIO_IRQ_INACTIVE; + + /* deassert the virtual IRQ and unmask physical one */ + qemu_set_irq(intp->qemuirq, 0); + vfio_unmask_irqindex(vbasedev, intp->pin); + eoi_done = true; + } + } + + /* + * in case there are pending IRQs, handle them one at a time */ + if (!QSIMPLEQ_EMPTY(&vdev->pending_intp_queue)) { + intp = QSIMPLEQ_FIRST(&vdev->pending_intp_queue); + vfio_intp_interrupt(intp); + QSIMPLEQ_REMOVE_HEAD(&vdev->pending_intp_queue, pqnext); + } + return; +} + +/* + * enable/disable the fast path mode + * fast path = MMIO region is mmaped (no KVM TRAP) + * slow path = MMIO region is trapped and region callbacks are called + * slow path enables to trap the IRQ status register guest reset +*/ + +static void vfio_mmap_set_enabled(VFIOPlatformDevice *vdev, bool enabled) +{ + VFIORegion *region; + int i; + + DPRINTF("fast path = %d\n", enabled); + + for (i = 0; i < vdev->vbasedev.num_regions; i++) { + region = vdev->regions[i]; + + /* register space is unmapped to trap EOI */ + memory_region_set_enabled(®ion->mmap_mem, enabled); + } +} + +/* + * Checks whether the IRQ is still pending. In the negative + * the fast path mode (where reg space is mmaped) can be restored. + * if the IRQ is still pending, we must keep on trapping IRQ status + * register reset with mmap disabled (slow path). 
+ * the function is called on mmap_timer event.
+ * by construction a single fd is handled at a time. See EOI comment
+ * for additional details.
+ */
+static void vfio_intp_mmap_enable(void *opaque)
+{
+    VFIOINTp *tmp;
+    VFIOPlatformDevice *vdev = (VFIOPlatformDevice *)opaque;
+    bool one_active_irq = false;
+
+    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
+        if (tmp->state == VFIO_IRQ_ACTIVE) {
+            if (one_active_irq) {
+                error_report("several active IRQ: "
+                             "this case should not happen!");
+            }
+            DPRINTF("IRQ #%d still pending, stay in slow path\n",
+                    tmp->pin);
+            /* re-arm the timer and check again after mmap_timeout ms */
+            timer_mod(vdev->mmap_timer,
+                      qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
+                      vdev->mmap_timeout);
+            one_active_irq = true;
+        }
+    }
+    if (one_active_irq) {
+        return;
+    }
+    DPRINTF("no pending IRQ, restore fast path\n");
+    vfio_mmap_set_enabled(vdev, true);
+}
+
+/*
+ * The fd handler
+ *
+ * @opaque is the VFIOINTp whose eventfd fired. It is also called
+ * directly by the EOI path to inject a previously queued IRQ.
+ */
+void vfio_intp_interrupt(void *opaque)
+{
+    int ret;
+    VFIOINTp *tmp, *intp = (VFIOINTp *)opaque;
+    VFIOPlatformDevice *vdev = intp->vdev;
+    bool one_active_irq = false;
+
+    /*
+     * first check whether there is a pending IRQ
+     * in the positive the new IRQ cannot be handled until the
+     * active one is not completed.
+     * by construction the same IRQ as the pending one cannot hit
+     * since the physical IRQ was disabled by the VFIO driver
+     */
+    QLIST_FOREACH(tmp, &vdev->intp_list, next) {
+        if (tmp->state == VFIO_IRQ_ACTIVE) {
+            one_active_irq = true;
+            break;
+        }
+    }
+    if (one_active_irq) {
+        /*
+         * the new IRQ gets a pending status and is pushed in
+         * the pending queue. An entry that is already queued must not
+         * be linked a second time.
+         */
+        if (intp->state != VFIO_IRQ_PENDING) {
+            intp->state = VFIO_IRQ_PENDING;
+            QSIMPLEQ_INSERT_TAIL(&vdev->pending_intp_queue,
+                                 intp, pqnext);
+        }
+        /*
+         * consume the event now: an eventfd that is left set stays
+         * readable, so the fd handler would presumably be invoked again
+         * for the same event.
+         */
+        event_notifier_test_and_clear(&intp->interrupt);
+        return;
+    }
+
+    /* no active IRQ, the new IRQ can be forwarded to guest */
+    DPRINTF("Handle IRQ #%d (fd = %d)\n",
+            intp->pin, event_notifier_get_fd(&intp->interrupt));
+
+    ret = event_notifier_test_and_clear(&intp->interrupt);
+    if (!ret) {
+        DPRINTF("Error when clearing fd=%d\n",
+                event_notifier_get_fd(&intp->interrupt));
+    }
+
+    intp->state = VFIO_IRQ_ACTIVE;
+
+    /* sets slow path */
+    vfio_mmap_set_enabled(vdev, false);
+
+    /* trigger the virtual IRQ */
+    qemu_set_irq(intp->qemuirq, 1);
+
+    /* schedule the mmap timer which will restore mmap path after EOI*/
+    if (vdev->mmap_timeout) {
+        timer_mod(vdev->mmap_timer,
+                  qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + vdev->mmap_timeout);
+    }
+}
+
+/*
+ * Create the VFIOINTp for IRQ @index, register its eventfd with the main
+ * loop and hand it to the VFIO driver with VFIO_DEVICE_SET_IRQS.
+ *
+ * Returns 0 on success, a negative errno value otherwise.
+ *
+ * NOTE(review): intp->pin is a uint8_t, so an @index above 255 would be
+ * truncated - confirm that this cannot happen.
+ */
+static int vfio_enable_intp(VFIODevice *vbasedev, unsigned int index)
+{
+    struct vfio_irq_set *irq_set;
+    int32_t *pfd;
+    int ret, argsz, fd;
+    int device = vbasedev->fd;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(vdev);
+    VFIOINTp *intp;
+
+    /* allocate and populate a new VFIOINTp structure put in a queue list */
+    intp = g_malloc0(sizeof(*intp));
+    intp->vdev = vdev;
+    intp->pin = index;
+    intp->state = VFIO_IRQ_INACTIVE;
+    sysbus_init_irq(sbdev, &intp->qemuirq);
+
+    /*
+     * NOTE(review): intp is deliberately not freed on the error paths
+     * below because &intp->qemuirq has already been handed to
+     * sysbus_init_irq(), which may keep the pointer - confirm.
+     */
+    ret = event_notifier_init(&intp->interrupt, 0);
+    if (ret) {
+        error_report("vfio: Error: event_notifier_init failed ");
+        return ret;
+    }
+    /* keep the fd in a local: irq_set is freed before the error path */
+    fd = event_notifier_get_fd(&intp->interrupt);
+
+    /* build the irq_set to be passed to the vfio kernel driver */
+    argsz = sizeof(*irq_set) + sizeof(*pfd);
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = index;
+    irq_set->start = 0;
+    irq_set->count = 1;
+    pfd = (int32_t *)&irq_set->data;
+    *pfd = fd;
+
+    DPRINTF("register fd=%d/irq index=%d to kernel\n", fd, index);
+
+    qemu_set_fd_handler(fd, vfio_intp_interrupt, NULL, intp);
+
+    /*
+     * pass the index/fd binding to the kernel driver so that it
+     * triggers this fd on HW IRQ
+     */
+    ret = ioctl(device, VFIO_DEVICE_SET_IRQS, irq_set);
+    if (ret) {
+        /* capture errno before g_free() gets a chance to change it */
+        ret = -errno;
+    }
+    g_free(irq_set);
+    if (ret) {
+        errno = -ret; /* %m below reads errno */
+        error_report("vfio: Error: Failed to pass IRQ fd to the driver: %m");
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        event_notifier_cleanup(&intp->interrupt);
+        return ret;
+    }
+
+    /* store the new intp in qlist */
+    QLIST_INSERT_HEAD(&vdev->intp_list, intp, next);
+    return 0;
+}
+
+/*
+ * Create the mmap timer and the pending IRQ queue, then query and enable
+ * each of the num_irqs IRQs of the device.
+ *
+ * Returns 0 on success, a negative errno value as soon as one IRQ cannot
+ * be queried or enabled.
+ */
+static int vfio_populate_interrupts(VFIODevice *vbasedev)
+{
+    struct vfio_irq_info irq = { .argsz = sizeof(irq) };
+    int i, ret;
+    VFIOPlatformDevice *vdev =
+        container_of(vbasedev, VFIOPlatformDevice, vbasedev);
+
+    vdev->mmap_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL,
+                                    vfio_intp_mmap_enable, vdev);
+
+    QSIMPLEQ_INIT(&vdev->pending_intp_queue);
+
+    for (i = 0; i < vbasedev->num_irqs; i++) {
+        irq.index = i;
+
+        DPRINTF("Retrieve IRQ info from vfio platform driver ...\n");
+
+        ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_IRQ_INFO, &irq);
+        if (ret) {
+            /* do not go on with an irq structure the kernel did not fill */
+            ret = -errno;
+            error_report("vfio: error getting device %s irq info",
+                         vbasedev->name);
+            return ret;
+        }
+        DPRINTF("- IRQ index %d: count %d, flags=0x%x\n",
+                irq.index, irq.count, irq.flags);
+
+        ret = vfio_enable_intp(vbasedev, irq.index);
+        if (ret) {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/* callbacks handed to the VFIO base device code through vbasedev->ops */
+static VFIODeviceOps vfio_platform_ops = {
+    .vfio_compute_needs_reset = vfio_platform_compute_needs_reset,
+    .vfio_hot_reset_multi = vfio_platform_hot_reset_multi,
+    .vfio_eoi = vfio_platform_eoi,
+    .vfio_check_device = vfio_platform_check_device,
+    .vfio_populate_regions = vfio_populate_regions,
+    .vfio_populate_interrupts = vfio_populate_interrupts,
+};
+
+/*
+ * Look up the host platform device named vbasedev->name in sysfs, find
+ * its IOMMU group, and attach the device through vfio_get_group() and
+ * vfio_get_device().
+ *
+ * Returns the value of vfio_get_device() on success, a negative errno
+ * value on failure. Every error is negative so that the caller's
+ * "ret < 0" check catches it.
+ */
+static int vfio_base_device_init(VFIODevice *vbasedev)
+{
+    VFIOGroup *group;
+    VFIODevice *vbasedev_iter;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+
+    /* name must be set prior to the call */
+    if (vbasedev->name == NULL) {
+        /* errno is not set by anything here, so do not return -errno */
+        return -EINVAL;
+    }
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path), "/sys/bus/platform/devices/%s/",
+             vbasedev->name);
+
+    if (stat(path, &st) < 0) {
+        ret = -errno;
+        error_report("vfio: error: no such host device: %s", path);
+        return ret;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+    len = readlink(path, iommu_group_path, sizeof(iommu_group_path));
+    if (len <= 0 || len >= (ssize_t)sizeof(iommu_group_path)) {
+        ret = len < 0 ? -errno : -ENAMETOOLONG;
+        error_report("vfio: error no iommu_group for device");
+        return ret;
+    }
+
+    /* readlink() does not NUL-terminate its result */
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path);
+
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        error_report("vfio: error reading %s: %m", path);
+        /* a failed conversion does not set errno */
+        return -EINVAL;
+    }
+
+    DPRINTF("%s(%s) group %d\n", __func__, vbasedev->name, groupid);
+
+    group = vfio_get_group(groupid, &address_space_memory);
+    if (!group) {
+        error_report("vfio: failed to get group %d", groupid);
+        return -ENOENT;
+    }
+
+    snprintf(path, sizeof(path), "%s", vbasedev->name);
+
+    /* refuse to attach the same host device twice */
+    QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
+        if (strcmp(vbasedev_iter->name, vbasedev->name) == 0) {
+            error_report("vfio: error: device %s is already attached", path);
+            vfio_put_group(group);
+            return -EBUSY;
+        }
+    }
+    ret = vfio_get_device(group, path, vbasedev);
+    if (ret < 0) {
+        error_report("vfio: failed to get device %s", path);
+        vfio_put_group(group);
+        return ret;
+    }
+    return ret;
+}
+
+/*
+ * Realize callback: attach the host device, then map each region and
+ * export it as a sysbus MMIO region.
+ *
+ * NOTE(review): @errp is never set when vfio_base_device_init() fails,
+ * so the failure is only visible through error_report() - confirm
+ * whether it should be reported through @errp.
+ */
+static void vfio_platform_realize(DeviceState *dev, Error **errp)
+{
+    VFIOPlatformDevice *vdev = VFIO_PLATFORM_DEVICE(dev);
+    SysBusDevice *sbdev = SYS_BUS_DEVICE(dev);
+    VFIODevice *vbasedev = &vdev->vbasedev;
+    int i, ret;
+
+    vbasedev->type = VFIO_DEVICE_TYPE_PLATFORM;
+    vbasedev->ops = &vfio_platform_ops;
+
+    DPRINTF("vfio device %s, compat = %s\n", vbasedev->name, vdev->compat);
+
+    ret = vfio_base_device_init(vbasedev);
+    if (ret < 0) {
+        return;
+    }
+
+    for (i = 0; i < vbasedev->num_regions; i++) {
+        vfio_map_region(vdev, i);
+        sysbus_init_mmio(sbdev, &vdev->regions[i]->mem);
+    }
+}
+
+/* no state is saved: the device is flagged as unmigratable */
+static const VMStateDescription vfio_platform_vmstate = {
+    .name = TYPE_VFIO_PLATFORM,
+    .version_id = 3,
+    .minimum_version_id = 2,
+    .fields = (VMStateField[]) {
+        VMSTATE_END_OF_LIST()
+    },
+    .unmigratable = 1,
+};
+
+static Property vfio_platform_dev_properties[] = {
+    DEFINE_PROP_STRING("vfio_device", VFIOPlatformDevice, vbasedev.name),
+    DEFINE_PROP_STRING("compat", VFIOPlatformDevice, compat),
+    DEFINE_PROP_UINT32("mmap-timeout-ms", VFIOPlatformDevice,
+                       mmap_timeout, 1100),
+    DEFINE_PROP_UINT32("num_irqs", VFIOPlatformDevice,
+                       vbasedev.num_irqs, 0),
+    DEFINE_PROP_UINT32("num_regions", VFIOPlatformDevice,
+                       vbasedev.num_regions, 0),
+    DEFINE_PROP_BOOL("irqfd", VFIOPlatformDevice, irqfd_allowed, true),
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+/* hook realize, the properties and the vmstate into the device class */
+static void vfio_platform_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->realize = vfio_platform_realize;
+    dc->props = vfio_platform_dev_properties;
+    dc->vmsd = &vfio_platform_vmstate;
+    dc->desc = "VFIO-based platform device assignment";
+    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
+}
+
+static const TypeInfo vfio_platform_dev_info = {
+    .name = TYPE_VFIO_PLATFORM,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(VFIOPlatformDevice),
+    .class_init = vfio_platform_class_init,
+    .class_size = sizeof(VFIOPlatformDeviceClass),
+};
+
+static void register_vfio_platform_dev_type(void)
+{
+    type_register_static(&vfio_platform_dev_info);
+}
+
+type_init(register_vfio_platform_dev_type)
diff --git a/include/hw/vfio/vfio-platform.h b/include/hw/vfio/vfio-platform.h
new
file mode 100644
index 0000000..134fc1e
--- /dev/null
+++ b/include/hw/vfio/vfio-platform.h
@@ -0,0 +1,74 @@
+/*
+ * vfio based device assignment support - platform devices
+ *
+ * Copyright Linaro Limited, 2014
+ *
+ * Authors:
+ * Kim Phillips <kim.phill...@linaro.org>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on vfio based PCI device assignment support:
+ * Copyright Red Hat, Inc. 2012
+ */
+
+#ifndef HW_VFIO_VFIO_PLATFORM_H
+#define HW_VFIO_VFIO_PLATFORM_H
+
+#include "hw/sysbus.h"
+#include "hw/vfio/vfio-common.h"
+
+#define TYPE_VFIO_PLATFORM "vfio-platform"
+
+enum {
+    VFIO_IRQ_INACTIVE = 0, /* not raised, or already EOI'ed */
+    VFIO_IRQ_PENDING = 1, /* queued while another IRQ is active */
+    VFIO_IRQ_ACTIVE = 2, /* virtual IRQ raised, EOI not seen yet */
+    /* VFIO_IRQ_ACTIVE_AND_PENDING cannot happen with VFIO */
+};
+
+typedef struct VFIOINTp {
+    QLIST_ENTRY(VFIOINTp) next; /* entry for IRQ list */
+    QSIMPLEQ_ENTRY(VFIOINTp) pqnext; /* entry for pending IRQ queue */
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
+    qemu_irq qemuirq; /* virtual IRQ line, set with qemu_set_irq() */
+    struct VFIOPlatformDevice *vdev; /* back pointer to device */
+    int state; /* inactive, pending, active (VFIO_IRQ_* above) */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
+    uint8_t pin; /* VFIO IRQ index */
+    uint8_t virtualID; /* virtual IRQ */
+} VFIOINTp;
+
+typedef struct VFIOPlatformDevice {
+    SysBusDevice sbdev;
+    VFIODevice vbasedev; /* not a QOM object */
+    VFIORegion **regions; /* one entry per device region */
+    QLIST_HEAD(, VFIOINTp) intp_list; /* list of IRQ */
+    /* queue of pending IRQ, injected one at a time on EOI */
+    QSIMPLEQ_HEAD(pending_intp_queue, VFIOINTp) pending_intp_queue;
+    char *compat; /* compatibility string */
+    bool irqfd_allowed; /* value of the "irqfd" property */
+    uint32_t mmap_timeout; /* delay (ms) to re-enable mmaps after interrupt */
+    QEMUTimer *mmap_timer; /* enable mmaps after periods w/o interrupts */
+} VFIOPlatformDevice;
+
+/* QOM class: holds only the parent SysBusDeviceClass */
+typedef struct VFIOPlatformDeviceClass {
+    /*< private >*/
+    SysBusDeviceClass parent_class;
+    /*< public >*/
+} VFIOPlatformDeviceClass;
+
+#define VFIO_PLATFORM_DEVICE(obj) \
+     OBJECT_CHECK(VFIOPlatformDevice, (obj), TYPE_VFIO_PLATFORM)
+#define VFIO_PLATFORM_DEVICE_CLASS(klass) \
+     OBJECT_CLASS_CHECK(VFIOPlatformDeviceClass, (klass), TYPE_VFIO_PLATFORM)
+#define VFIO_PLATFORM_DEVICE_GET_CLASS(obj) \
+     OBJECT_GET_CLASS(VFIOPlatformDeviceClass, (obj), TYPE_VFIO_PLATFORM)
+/* vfio_setup_irqfd() is only declared by this patch, not defined */
+void vfio_intp_interrupt(void *opaque);
+void vfio_setup_irqfd(SysBusDevice *dev, int index, int virq);
+
+#endif
--
1.8.3.2