This patch adds an operation callback which gets called every time the
library wants to call eventfd_write(). The eventfd_write() call results
in a system call, which could block the PMD thread.
The callback function can decide whether it's ok to handle the
eventfd_write() now or have the newly introduced function,
rte_vhost_notify_guest(), called at a later time.

This can be used by 3rd party applications, like OVS, to avoid system
calls being issued from the PMD threads (see the usage sketch after the
diff).

Signed-off-by: Eelco Chaudron <echau...@redhat.com>
---
 lib/vhost/meson.build |  2 +
 lib/vhost/rte_vhost.h | 23 +++++++++++++++-
 lib/vhost/socket.c    | 72 ++++++++++++++++++++++++++++++++++++++++++++++---
 lib/vhost/version.map |  9 ++++++
 lib/vhost/vhost.c     | 38 ++++++++++++++++++++++++++
 lib/vhost/vhost.h     | 65 +++++++++++++++++++++++++++++++-------------
 6 files changed, 184 insertions(+), 25 deletions(-)

diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build
index 197a51d936..e93ba6b078 100644
--- a/lib/vhost/meson.build
+++ b/lib/vhost/meson.build
@@ -39,3 +39,5 @@ driver_sdk_headers = files(
         'vdpa_driver.h',
 )
 deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev']
+
+use_function_versioning = true
diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h
index 58a5d4be92..7a10bc36cf 100644
--- a/lib/vhost/rte_vhost.h
+++ b/lib/vhost/rte_vhost.h
@@ -298,7 +298,13 @@ struct rte_vhost_device_ops {
 	 */
 	void (*guest_notified)(int vid);
 
-	void *reserved[1]; /**< Reserved for future extension */
+	/**
+	 * If this callback is registered, notification to the guest can
+	 * be handled by the front-end calling rte_vhost_notify_guest().
+	 * If it's not handled, 'false' should be returned. This can be used
+	 * to remove the "slow" eventfd_write() syscall from the datapath.
+	 */
+	bool (*guest_notify)(int vid, uint16_t queue_id);
 };
 
 /**
@@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t vring_idx,
 int
 rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change, or be removed, without prior notice.
+ *
+ * Inject the offloaded interrupt into the vhost device's queue. For more
+ * details see the 'guest_notify' vhost device operation.
+ *
+ * @param vid
+ *  vhost device ID
+ * @param queue_id
+ *  virtio queue index
+ */
+__rte_experimental
+void rte_vhost_notify_guest(int vid, uint16_t queue_id);
+
 /**
  * Register vhost driver. path could be different for multiple
  * instance support.
diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c
index 669c322e12..787d6bacf8 100644
--- a/lib/vhost/socket.c
+++ b/lib/vhost/socket.c
@@ -15,6 +15,7 @@
 #include <fcntl.h>
 #include <pthread.h>
 
+#include <rte_function_versioning.h>
 #include <rte_log.h>
 
 #include "fd_man.h"
@@ -43,6 +44,7 @@ struct vhost_user_socket {
 	bool async_copy;
 	bool net_compliant_ol_flags;
 	bool stats_enabled;
+	bool alloc_notify_ops;
 
 	/*
 	 * The "supported_features" indicates the feature bits the
@@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
 		vsocket->path = NULL;
 	}
 
+	if (vsocket && vsocket->alloc_notify_ops) {
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+		free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		vsocket->notify_ops = NULL;
+	}
+
 	if (vsocket) {
 		free(vsocket);
 		vsocket = NULL;
@@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path)
 /*
  * Register ops so that we can add/remove device to data core.
  */
-int
-rte_vhost_driver_callback_register(const char *path,
-	struct rte_vhost_device_ops const * const ops)
+static int
+rte_vhost_driver_callback_register__(const char *path,
+	struct rte_vhost_device_ops const * const ops, bool ops_allocated)
 {
 	struct vhost_user_socket *vsocket;
 
 	pthread_mutex_lock(&vhost_user.mutex);
 	vsocket = find_vhost_user_socket(path);
-	if (vsocket)
+	if (vsocket) {
+		if (vsocket->alloc_notify_ops) {
+			vsocket->alloc_notify_ops = false;
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wcast-qual"
+			free((struct rte_vhost_device_ops *)vsocket->notify_ops);
+#pragma GCC diagnostic pop
+		}
 		vsocket->notify_ops = ops;
+		if (ops_allocated)
+			vsocket->alloc_notify_ops = true;
+	}
 	pthread_mutex_unlock(&vhost_user.mutex);
 
 	return vsocket ? 0 : -1;
 }
 
+int __vsym
+rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	return rte_vhost_driver_callback_register__(path, ops, false);
+}
+
+int __vsym
+rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops)
+{
+	int ret;
+
+	/*
+	 * Although the ops structure is a const structure, we do need to
+	 * override the guest_notify operation. This is because with the
+	 * previous APIs it was "reserved" and if any garbage value was passed,
+	 * it could crash the application.
+	 */
+	if (ops && ops->guest_notify) {
+		struct rte_vhost_device_ops *new_ops;
+
+		new_ops = malloc(sizeof(*new_ops));
+		if (new_ops == NULL)
+			return -1;
+
+		memcpy(new_ops, ops, sizeof(*new_ops));
+		new_ops->guest_notify = NULL;
+
+		ret = rte_vhost_driver_callback_register__(path, new_ops, true);
+	} else {
+		ret = rte_vhost_driver_callback_register__(path, ops, false);
+	}
+
+	return ret;
+}
+
+/* Mark the v23 function as the old version, and v24 as the default version. */
+VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
+BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
+MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
+		struct rte_vhost_device_ops const * const ops),
+		rte_vhost_driver_callback_register_v24);
+
 struct rte_vhost_device_ops const *
 vhost_driver_callback_get(const char *path)
 {
diff --git a/lib/vhost/version.map b/lib/vhost/version.map
index d322a4a888..7bcbfd12cf 100644
--- a/lib/vhost/version.map
+++ b/lib/vhost/version.map
@@ -64,6 +64,12 @@ DPDK_23 {
 	local: *;
 };
 
+DPDK_24 {
+	global:
+
+	rte_vhost_driver_callback_register;
+} DPDK_23;
+
 EXPERIMENTAL {
 	global:
 
@@ -98,6 +104,9 @@ EXPERIMENTAL {
 	# added in 22.11
 	rte_vhost_async_dma_unconfigure;
 	rte_vhost_vring_call_nonblock;
+
+	# added in 23.07
+	rte_vhost_notify_guest;
 };
 
 INTERNAL {
diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c
index 8ff6434c93..79e88f986e 100644
--- a/lib/vhost/vhost.c
+++ b/lib/vhost/vhost.c
@@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off vhost_vq_stat_strings[] = {
 	{"size_1024_1518_packets", offsetof(struct vhost_virtqueue, stats.size_bins[6])},
 	{"size_1519_max_packets", offsetof(struct vhost_virtqueue, stats.size_bins[7])},
 	{"guest_notifications", offsetof(struct vhost_virtqueue, stats.guest_notifications)},
+	{"guest_notifications_offloaded", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_offloaded)},
+	{"guest_notifications_error", offsetof(struct vhost_virtqueue,
+		stats.guest_notifications_error)},
 	{"iotlb_hits", offsetof(struct vhost_virtqueue, stats.iotlb_hits)},
 	{"iotlb_misses", offsetof(struct vhost_virtqueue, stats.iotlb_misses)},
 	{"inflight_submitted", offsetof(struct vhost_virtqueue, stats.inflight_submitted)},
@@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int enable)
 	return ret;
 }
 
+void
+rte_vhost_notify_guest(int vid, uint16_t queue_id)
+{
+	struct virtio_net *dev = get_device(vid);
+	struct vhost_virtqueue *vq;
+
+	if (!dev || queue_id >= VHOST_MAX_VRING)
+		return;
+
+	vq = dev->virtqueue[queue_id];
+	if (!vq)
+		return;
+
+	rte_rwlock_read_lock(&vq->access_lock);
+
+	if (vq->callfd >= 0) {
+		int ret = eventfd_write(vq->callfd, (eventfd_t)1);
+
+		if (ret) {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications_error,
+					1, __ATOMIC_RELAXED);
+		} else {
+			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+				__atomic_fetch_add(&vq->stats.guest_notifications,
+					1, __ATOMIC_RELAXED);
+			if (dev->notify_ops->guest_notified)
+				dev->notify_ops->guest_notified(dev->vid);
+		}
+	}
+
+	rte_rwlock_read_unlock(&vq->access_lock);
+}
+
 void
 rte_vhost_log_write(int vid, uint64_t addr, uint64_t len)
 {
diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h
index 37609c7c8d..0ab75b78b5 100644
--- a/lib/vhost/vhost.h
+++ b/lib/vhost/vhost.h
@@ -141,6 +141,8 @@ struct virtqueue_stats {
 	uint64_t inflight_completed;
 	/* Counters below are atomic, and should be incremented as such. */
 	uint64_t guest_notifications;
+	uint64_t guest_notifications_offloaded;
+	uint64_t guest_notifications_error;
 };
 
 /**
@@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
 	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
 }
 
+static __rte_always_inline void
+vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq)
+{
+	int ret;
+
+	if (dev->notify_ops->guest_notify &&
+	    dev->notify_ops->guest_notify(dev->vid, vq->index)) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_offloaded,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	ret = eventfd_write(vq->callfd, (eventfd_t) 1);
+	if (ret) {
+		if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+			__atomic_fetch_add(&vq->stats.guest_notifications_error,
+				1, __ATOMIC_RELAXED);
+		return;
+	}
+
+	if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
+		__atomic_fetch_add(&vq->stats.guest_notifications,
+			1, __ATOMIC_RELAXED);
+	if (dev->notify_ops->guest_notified)
+		dev->notify_ops->guest_notified(dev->vid);
+}
+
+
 static __rte_always_inline void
 vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 {
@@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq)
 			"%s: used_event_idx=%d, old=%d, new=%d\n",
 			__func__, vhost_used_event(vq), old, new);
 
-		if ((vhost_need_event(vhost_used_event(vq), new, old) &&
-					(vq->callfd >= 0)) ||
-				unlikely(!signalled_used_valid)) {
-			eventfd_write(vq->callfd, (eventfd_t) 1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+		if ((vhost_need_event(vhost_used_event(vq), new, old) ||
+				unlikely(!signalled_used_valid)) &&
+				vq->callfd >= 0) {
+			vhost_vring_inject_irq(dev, vq);
 		}
 	} else {
 		/* Kick the guest if necessary. */
 		if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
 				&& (vq->callfd >= 0)) {
-			eventfd_write(vq->callfd, (eventfd_t)1);
-			if (dev->flags & VIRTIO_DEV_STATS_ENABLED)
-				__atomic_fetch_add(&vq->stats.guest_notifications,
-					1, __ATOMIC_RELAXED);
-			if (dev->notify_ops->guest_notified)
-				dev->notify_ops->guest_notified(dev->vid);
+			vhost_vring_inject_irq(dev, vq);
 		}
 	}
 }
@@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct vhost_virtqueue *vq)
 	if (vhost_need_event(off, new, old))
 		kick = true;
 kick:
-	if (kick) {
-		eventfd_write(vq->callfd, (eventfd_t)1);
-		if (dev->notify_ops->guest_notified)
-			dev->notify_ops->guest_notified(dev->vid);
-	}
+	if (kick && vq->callfd >= 0)
+		vhost_vring_inject_irq(dev, vq);
 }
 
 static __rte_always_inline void
@@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m)
 
 uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr);
 void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment);
+
+/* Versioned functions */
+int rte_vhost_driver_callback_register_v23(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+int rte_vhost_driver_callback_register_v24(const char *path,
+	struct rte_vhost_device_ops const * const ops);
+
 #endif /* _VHOST_NET_CDEV_H_ */
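
For reference, below is a minimal usage sketch (not part of the patch) of
how an application could consume the new callback. The deferral queue and
the names MAX_DEFERRED, app_guest_notify() and
flush_deferred_notifications() are hypothetical; only
rte_vhost_driver_callback_register(), the guest_notify operation and
rte_vhost_notify_guest() come from the library. The callback is invoked
from the datapath with the virtqueue lock held, so it should only queue
work and never block; the deferred eventfd_write() then happens inside
rte_vhost_notify_guest(), called from a non-PMD thread.

/*
 * Build with -DALLOW_EXPERIMENTAL_API, as rte_vhost_notify_guest() is
 * experimental.
 */
#include <stdbool.h>
#include <stdint.h>
#include <pthread.h>

#include <rte_vhost.h>

#define MAX_DEFERRED 1024

/* Hypothetical application-side queue of postponed guest notifications. */
static struct {
	int vid;
	uint16_t qid;
} deferred[MAX_DEFERRED];
static int n_deferred;
static pthread_mutex_t deferred_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Called by the vhost library instead of doing the eventfd_write() itself.
 * Returning true means the application takes ownership of the notification,
 * so the PMD thread never enters the kernel.
 */
static bool
app_guest_notify(int vid, uint16_t queue_id)
{
	bool queued = false;

	pthread_mutex_lock(&deferred_lock);
	if (n_deferred < MAX_DEFERRED) {
		deferred[n_deferred].vid = vid;
		deferred[n_deferred].qid = queue_id;
		n_deferred++;
		queued = true;
	}
	pthread_mutex_unlock(&deferred_lock);

	/* On false, the library falls back to eventfd_write() as before. */
	return queued;
}

static const struct rte_vhost_device_ops app_ops = {
	/* new_device, destroy_device, ... omitted for brevity. */
	.guest_notify = app_guest_notify,
};

/* Run from a non-PMD thread: issue the postponed kicks. */
static void
flush_deferred_notifications(void)
{
	pthread_mutex_lock(&deferred_lock);
	while (n_deferred > 0) {
		n_deferred--;
		rte_vhost_notify_guest(deferred[n_deferred].vid,
				deferred[n_deferred].qid);
	}
	pthread_mutex_unlock(&deferred_lock);
}

/* Register after rte_vhost_driver_register(path, flags). */
static int
app_register_vhost_callbacks(const char *path)
{
	return rte_vhost_driver_callback_register(path, &app_ops);
}

In an application like OVS, flush_deferred_notifications() would run from a
dedicated thread, so the kicks never add syscall latency to the
packet-processing loop.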