On Wed, Nov 17, 2010 at 04:19:27PM +0000, Stefan Hajnoczi wrote: > Virtqueue notify is currently handled synchronously in userspace virtio. This > prevents the vcpu from executing guest code while hardware emulation code > handles the notify. > > On systems that support KVM, the ioeventfd mechanism can be used to make > virtqueue notify a lightweight exit by deferring hardware emulation to the > iothread and allowing the VM to continue execution. This model is similar to > how vhost receives virtqueue notifies. > > The result of this change is improved performance for userspace virtio > devices. > Virtio-blk throughput increases especially for multithreaded scenarios and > virtio-net transmit throughput increases substantially. > > Some virtio devices are known to have guest drivers which expect a notify to > be > processed synchronously and spin waiting for completion. Only enable > ioeventfd > for virtio-blk and virtio-net for now. > > Care must be taken not to interfere with vhost-net, which uses host > notifiers. If the set_host_notifier() API is used by a device > virtio-pci will disable virtio-ioeventfd and let the device deal with > host notifiers as it wishes. > > After migration and on VM change state (running/paused) virtio-ioeventfd > will enable/disable itself. > > * VIRTIO_CONFIG_S_DRIVER_OK -> enable virtio-ioeventfd > * !VIRTIO_CONFIG_S_DRIVER_OK -> disable virtio-ioeventfd > * virtio_pci_set_host_notifier() -> disable virtio-ioeventfd > * vm_change_state(running=0) -> disable virtio-ioeventfd > * vm_change_state(running=1) -> enable virtio-ioeventfd > > Signed-off-by: Stefan Hajnoczi <stefa...@linux.vnet.ibm.com>
So this is pretty clean. Two things that worry me a bit: - With vhost-net, it seems that we might now run in userspace just before start or just after stop. This means that e.g. if there's a ring processing bug in userspace we'll get hit by it, which I'd like to avoid. - There seems to be a bit of duplication where we call start/stop in a similar way in lots of places. These are really all the places where we call set_status. Might be nice to find a way to reduce that duplication. I'm trying to think of ways to improve this a bit, if I don't find any way to improve it I guess I'll just apply the series as is. > --- > hw/virtio-pci.c | 188 > ++++++++++++++++++++++++++++++++++++++++++++++++------- > hw/virtio.c | 14 +++- > hw/virtio.h | 1 + > 3 files changed, 177 insertions(+), 26 deletions(-) > > diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c > index 549118d..0217fda 100644 > --- a/hw/virtio-pci.c > +++ b/hw/virtio-pci.c > @@ -83,6 +83,11 @@ > /* Flags track per-device state like workarounds for quirks in older guests. > */ > #define VIRTIO_PCI_FLAG_BUS_MASTER_BUG (1 << 0) > > +/* Performance improves when virtqueue kick processing is decoupled from the > + * vcpu thread using ioeventfd for some devices. */ > +#define VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT 1 > +#define VIRTIO_PCI_FLAG_USE_IOEVENTFD (1 << > VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT) > + > /* QEMU doesn't strictly need write barriers since everything runs in > * lock-step. We'll leave the calls to wmb() in though to make it obvious > for > * KVM or if kqemu gets SMP support. > @@ -107,6 +112,8 @@ typedef struct { > /* Max. 
number of ports we can have for a the virtio-serial device */ > uint32_t max_virtserial_ports; > virtio_net_conf net; > + bool ioeventfd_started; > + VMChangeStateEntry *vm_change_state_entry; > } VirtIOPCIProxy; > > /* virtio device */ > @@ -179,12 +186,129 @@ static int virtio_pci_load_queue(void * opaque, int n, > QEMUFile *f) > return 0; > } > > +static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy, > + int n, bool assign) > +{ > + VirtQueue *vq = virtio_get_queue(proxy->vdev, n); > + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); > + int r; > + if (assign) { > + r = event_notifier_init(notifier, 1); > + if (r < 0) { > + return r; > + } > + r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier), > + proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, > + n, assign); > + if (r < 0) { > + event_notifier_cleanup(notifier); > + } > + } else { > + r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier), > + proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, > + n, assign); > + if (r < 0) { > + return r; > + } > + event_notifier_cleanup(notifier); > + } > + return r; > +} > + > +static void virtio_pci_host_notifier_read(void *opaque) > +{ > + VirtQueue *vq = opaque; > + EventNotifier *n = virtio_queue_get_host_notifier(vq); > + if (event_notifier_test_and_clear(n)) { > + virtio_queue_notify_vq(vq); > + } > +} > + > +static void virtio_pci_set_host_notifier_fd_handler(VirtIOPCIProxy *proxy, > + int n, bool assign) > +{ > + VirtQueue *vq = virtio_get_queue(proxy->vdev, n); > + EventNotifier *notifier = virtio_queue_get_host_notifier(vq); > + if (assign) { > + qemu_set_fd_handler(event_notifier_get_fd(notifier), > + virtio_pci_host_notifier_read, NULL, vq); > + } else { > + qemu_set_fd_handler(event_notifier_get_fd(notifier), > + NULL, NULL, NULL); > + } > +} > + > +static int virtio_pci_start_ioeventfd(VirtIOPCIProxy *proxy) > +{ > + int n, r; > + > + if (!(proxy->flags & VIRTIO_PCI_FLAG_USE_IOEVENTFD) || > + proxy->ioeventfd_started) { > + 
return 0; > + } > + > + for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) { > + if (!virtio_queue_get_num(proxy->vdev, n)) { > + continue; > + } > + > + r = virtio_pci_set_host_notifier_internal(proxy, n, true); > + if (r < 0) { > + goto assign_error; > + } > + > + virtio_pci_set_host_notifier_fd_handler(proxy, n, true); > + } > + proxy->ioeventfd_started = true; > + return 0; > + > +assign_error: > + while (--n >= 0) { > + if (!virtio_queue_get_num(proxy->vdev, n)) { > + continue; > + } > + > + virtio_pci_set_host_notifier_fd_handler(proxy, n, false); > + virtio_pci_set_host_notifier_internal(proxy, n, false); > + virtio_queue_notify(proxy->vdev, n); > + } > + proxy->ioeventfd_started = false; > + proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; > + return r; > +} > + > +static int virtio_pci_stop_ioeventfd(VirtIOPCIProxy *proxy) > +{ > + int n; > + > + if (!proxy->ioeventfd_started) { > + return 0; > + } > + > + for (n = 0; n < VIRTIO_PCI_QUEUE_MAX; n++) { > + if (!virtio_queue_get_num(proxy->vdev, n)) { > + continue; > + } > + > + virtio_pci_set_host_notifier_fd_handler(proxy, n, false); > + virtio_pci_set_host_notifier_internal(proxy, n, false); > + > + /* Now notify the vq to handle the race condition where the guest > + * kicked and we deassigned before we got around to handling the > kick. > + */ Can't we just call event_notifier_test_and_clear to figure out whether this happened? This seems cleaner than calling the notifier unconditionally. 
> + virtio_queue_notify(proxy->vdev, n); > + } > + proxy->ioeventfd_started = false; > + return 0; > +} > + > static void virtio_pci_reset(DeviceState *d) > { > VirtIOPCIProxy *proxy = container_of(d, VirtIOPCIProxy, pci_dev.qdev); > + virtio_pci_stop_ioeventfd(proxy); > virtio_reset(proxy->vdev); > msix_reset(&proxy->pci_dev); > - proxy->flags = 0; > + proxy->flags &= ~VIRTIO_PCI_FLAG_BUS_MASTER_BUG; > } > > static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > @@ -209,6 +333,7 @@ static void virtio_ioport_write(void *opaque, uint32_t > addr, uint32_t val) > case VIRTIO_PCI_QUEUE_PFN: > pa = (target_phys_addr_t)val << VIRTIO_PCI_QUEUE_ADDR_SHIFT; > if (pa == 0) { > + virtio_pci_stop_ioeventfd(proxy); > virtio_reset(proxy->vdev); > msix_unuse_all_vectors(&proxy->pci_dev); > } > @@ -223,6 +348,12 @@ static void virtio_ioport_write(void *opaque, uint32_t > addr, uint32_t val) > virtio_queue_notify(vdev, val); > break; > case VIRTIO_PCI_STATUS: > + if (val & VIRTIO_CONFIG_S_DRIVER_OK) { > + virtio_pci_start_ioeventfd(proxy); > + } else { > + virtio_pci_stop_ioeventfd(proxy); > + } > + > virtio_set_status(vdev, val & 0xFF); > if (vdev->status == 0) { > virtio_reset(proxy->vdev); > @@ -403,6 +534,7 @@ static void virtio_write_config(PCIDevice *pci_dev, > uint32_t address, > if (PCI_COMMAND == address) { > if (!(val & PCI_COMMAND_MASTER)) { > if (!(proxy->flags & VIRTIO_PCI_FLAG_BUS_MASTER_BUG)) { > + virtio_pci_stop_ioeventfd(proxy); > virtio_set_status(proxy->vdev, > proxy->vdev->status & > ~VIRTIO_CONFIG_S_DRIVER_OK); > } > @@ -480,30 +612,27 @@ assign_error: > static int virtio_pci_set_host_notifier(void *opaque, int n, bool assign) > { > VirtIOPCIProxy *proxy = opaque; > - VirtQueue *vq = virtio_get_queue(proxy->vdev, n); > - EventNotifier *notifier = virtio_queue_get_host_notifier(vq); > - int r; > - if (assign) { > - r = event_notifier_init(notifier, 1); > - if (r < 0) { > - return r; > - } > - r = 
kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier), > - proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, > - n, assign); > - if (r < 0) { > - event_notifier_cleanup(notifier); > - } > + > + /* Stop using ioeventfd for virtqueue kick if the device starts using > host > + * notifiers. This makes it easy to avoid stepping on each others' toes. > + */ > + if (proxy->ioeventfd_started) { > + virtio_pci_stop_ioeventfd(proxy); > + proxy->flags &= ~VIRTIO_PCI_FLAG_USE_IOEVENTFD; > + } > + > + return virtio_pci_set_host_notifier_internal(proxy, n, assign); > +} > + > +static void virtio_pci_vm_change_state_handler(void *opaque, int running, > int reason) > +{ > + VirtIOPCIProxy *proxy = opaque; > + > + if (running && (proxy->vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { > + virtio_pci_start_ioeventfd(proxy); > } else { > - r = kvm_set_ioeventfd_pio_word(event_notifier_get_fd(notifier), > - proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY, > - n, assign); > - if (r < 0) { > - return r; > - } > - event_notifier_cleanup(notifier); > + virtio_pci_stop_ioeventfd(proxy); > } > - return r; > } > > static const VirtIOBindings virtio_pci_bindings = { > @@ -563,6 +692,10 @@ static void virtio_init_pci(VirtIOPCIProxy *proxy, > VirtIODevice *vdev, > proxy->host_features |= 0x1 << VIRTIO_F_NOTIFY_ON_EMPTY; > proxy->host_features |= 0x1 << VIRTIO_F_BAD_FEATURE; > proxy->host_features = vdev->get_features(vdev, proxy->host_features); > + > + proxy->vm_change_state_entry = qemu_add_vm_change_state_handler( > + virtio_pci_vm_change_state_handler, > + proxy); > } > > static int virtio_blk_init_pci(PCIDevice *pci_dev) > @@ -590,6 +723,9 @@ static int virtio_blk_init_pci(PCIDevice *pci_dev) > > static int virtio_exit_pci(PCIDevice *pci_dev) > { > + VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > + > + qemu_del_vm_change_state_handler(proxy->vm_change_state_entry); > return msix_uninit(pci_dev); > } > > @@ -597,6 +733,7 @@ static int virtio_blk_exit_pci(PCIDevice *pci_dev) > { > 
VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > > + virtio_pci_stop_ioeventfd(proxy); > virtio_blk_exit(proxy->vdev); > blockdev_mark_auto_del(proxy->block.bs); > return virtio_exit_pci(pci_dev); > @@ -658,6 +795,7 @@ static int virtio_net_exit_pci(PCIDevice *pci_dev) > { > VirtIOPCIProxy *proxy = DO_UPCAST(VirtIOPCIProxy, pci_dev, pci_dev); > > + virtio_pci_stop_ioeventfd(proxy); > virtio_net_exit(proxy->vdev); > return virtio_exit_pci(pci_dev); > } > @@ -702,6 +840,8 @@ static PCIDeviceInfo virtio_info[] = { > .qdev.props = (Property[]) { > DEFINE_PROP_HEX32("class", VirtIOPCIProxy, class_code, 0), > DEFINE_BLOCK_PROPERTIES(VirtIOPCIProxy, block), > + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, > + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), > DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2), > DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features), > DEFINE_PROP_END_OF_LIST(), > @@ -714,6 +854,8 @@ static PCIDeviceInfo virtio_info[] = { > .exit = virtio_net_exit_pci, > .romfile = "pxe-virtio.bin", > .qdev.props = (Property[]) { > + DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, > + VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), > DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), > DEFINE_VIRTIO_NET_FEATURES(VirtIOPCIProxy, host_features), > DEFINE_NIC_PROPERTIES(VirtIOPCIProxy, nic), > diff --git a/hw/virtio.c b/hw/virtio.c > index a2a657e..1f83b2c 100644 > --- a/hw/virtio.c > +++ b/hw/virtio.c > @@ -574,11 +574,19 @@ int virtio_queue_get_num(VirtIODevice *vdev, int n) > return vdev->vq[n].vring.num; > } > > +void virtio_queue_notify_vq(VirtQueue *vq) > +{ > + if (vq->vring.desc) { > + VirtIODevice *vdev = vq->vdev; > + trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); > + vq->handle_output(vdev, vq); > + } > +} > + > void virtio_queue_notify(VirtIODevice *vdev, int n) > { > - if (n < VIRTIO_PCI_QUEUE_MAX && vdev->vq[n].vring.desc) { > - trace_virtio_queue_notify(vdev, n, &vdev->vq[n]); > - 
vdev->vq[n].handle_output(vdev, &vdev->vq[n]); > + if (n < VIRTIO_PCI_QUEUE_MAX) { > + virtio_queue_notify_vq(&vdev->vq[n]); > } > } > > diff --git a/hw/virtio.h b/hw/virtio.h > index 02fa312..5ae521c 100644 > --- a/hw/virtio.h > +++ b/hw/virtio.h > @@ -219,5 +219,6 @@ void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, > int n, uint16_t idx); > VirtQueue *virtio_get_queue(VirtIODevice *vdev, int n); > EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq); > EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq); > +void virtio_queue_notify_vq(VirtQueue *vq); > void virtio_irq(VirtQueue *vq); > #endif > -- > 1.7.2.3