On Wed, Apr 5, 2023 at 2:42 PM Eelco Chaudron <echau...@redhat.com> wrote: > > This patch adds an operation callback which gets called every time the > library wants to call eventfd_write(). This eventfd_write() call could > result in a system call, which could potentially block the PMD thread. > > The callback function can decide whether it's ok to handle the > eventfd_write() now or have the newly introduced function, > rte_vhost_notify_guest(), called at a later time. > > This can be used by 3rd party applications, like OVS, to avoid system > calls being called as part of the PMD threads. > > Signed-off-by: Eelco Chaudron <echau...@redhat.com> > --- > lib/vhost/meson.build | 2 + > lib/vhost/rte_vhost.h | 23 +++++++++++++++- > lib/vhost/socket.c | 72 > ++++++++++++++++++++++++++++++++++++++++++++++--- > lib/vhost/version.map | 9 ++++++ > lib/vhost/vhost.c | 38 ++++++++++++++++++++++++++ > lib/vhost/vhost.h | 65 +++++++++++++++++++++++++++++++------------- > 6 files changed, 184 insertions(+), 25 deletions(-) > > diff --git a/lib/vhost/meson.build b/lib/vhost/meson.build > index 197a51d936..e93ba6b078 100644 > --- a/lib/vhost/meson.build > +++ b/lib/vhost/meson.build > @@ -39,3 +39,5 @@ driver_sdk_headers = files( > 'vdpa_driver.h', > ) > deps += ['ethdev', 'cryptodev', 'hash', 'pci', 'dmadev'] > + > +use_function_versioning = true > diff --git a/lib/vhost/rte_vhost.h b/lib/vhost/rte_vhost.h > index 58a5d4be92..7a10bc36cf 100644 > --- a/lib/vhost/rte_vhost.h > +++ b/lib/vhost/rte_vhost.h > @@ -298,7 +298,13 @@ struct rte_vhost_device_ops { > */ > void (*guest_notified)(int vid); > > - void *reserved[1]; /**< Reserved for future extension */ > + /** > + * If this callback is registered, notification to the guest can > + * be handled by the front-end calling rte_vhost_notify_guest(). > + * If it's not handled, 'false' should be returned. This can be used > + * to remove the "slow" eventfd_write() syscall from the datapath. > + */ > + bool (*guest_notify)(int vid, uint16_t queue_id); > }; > > /** > @@ -433,6 +439,21 @@ void rte_vhost_log_used_vring(int vid, uint16_t > vring_idx, > > int rte_vhost_enable_guest_notification(int vid, uint16_t queue_id, int > enable); > > +/** > + * @warning > + * @b EXPERIMENTAL: this API may change, or be removed, without prior notice. > + * > + * Inject the offloaded interrupt into the vhost device's queue. For more > + * details see the 'guest_notify' vhost device operation. > + * > + * @param vid > + * vhost device ID > + * @param queue_id > + * virtio queue index > + */ > +__rte_experimental > +void rte_vhost_notify_guest(int vid, uint16_t queue_id); > + > /** > * Register vhost driver. path could be different for multiple > * instance support. > diff --git a/lib/vhost/socket.c b/lib/vhost/socket.c > index 669c322e12..787d6bacf8 100644 > --- a/lib/vhost/socket.c > +++ b/lib/vhost/socket.c > @@ -15,6 +15,7 @@ > #include <fcntl.h> > #include <pthread.h> > > +#include <rte_function_versioning.h> > #include <rte_log.h> > > #include "fd_man.h" > @@ -43,6 +44,7 @@ struct vhost_user_socket { > bool async_copy; > bool net_compliant_ol_flags; > bool stats_enabled; > + bool alloc_notify_ops; > > /* > * The "supported_features" indicates the feature bits the > @@ -846,6 +848,14 @@ vhost_user_socket_mem_free(struct vhost_user_socket > *vsocket) > vsocket->path = NULL; > } > > + if (vsocket && vsocket->alloc_notify_ops) { > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wcast-qual" > + free((struct rte_vhost_device_ops *)vsocket->notify_ops); > +#pragma GCC diagnostic pop > + vsocket->notify_ops = NULL; > + }
Rather than select the behavior based on a boolean (and here force the compiler to close its eyes), I would instead add a non const pointer to ops (let's say alloc_notify_ops) in vhost_user_socket. The code can then unconditionnally call free(vsocket->alloc_notify_ops); > + > if (vsocket) { > free(vsocket); > vsocket = NULL; > @@ -1099,21 +1109,75 @@ rte_vhost_driver_unregister(const char *path) > /* > * Register ops so that we can add/remove device to data core. > */ > -int > -rte_vhost_driver_callback_register(const char *path, > - struct rte_vhost_device_ops const * const ops) > +static int > +rte_vhost_driver_callback_register__(const char *path, No need for a rte_ prefix on static symbol. And we can simply call this internal helper vhost_driver_callback_register(). > + struct rte_vhost_device_ops const * const ops, bool ops_allocated) > { > struct vhost_user_socket *vsocket; > > pthread_mutex_lock(&vhost_user.mutex); > vsocket = find_vhost_user_socket(path); > - if (vsocket) > + if (vsocket) { > + if (vsocket->alloc_notify_ops) { > + vsocket->alloc_notify_ops = false; > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wcast-qual" > + free((struct rte_vhost_device_ops > *)vsocket->notify_ops); > +#pragma GCC diagnostic pop > + } > vsocket->notify_ops = ops; > + if (ops_allocated) > + vsocket->alloc_notify_ops = true; > + } > pthread_mutex_unlock(&vhost_user.mutex); > > return vsocket ? 0 : -1; > } > > +int __vsym > +rte_vhost_driver_callback_register_v24(const char *path, > + struct rte_vhost_device_ops const * const ops) > +{ > + return rte_vhost_driver_callback_register__(path, ops, false); > +} > + > +int __vsym > +rte_vhost_driver_callback_register_v23(const char *path, > + struct rte_vhost_device_ops const * const ops) > +{ > + int ret; > + > + /* > + * Although the ops structure is a const structure, we do need to > + * override the guest_notify operation. This is because with the > + * previous APIs it was "reserved" and if any garbage value was > passed, > + * it could crash the application. > + */ > + if (ops && !ops->guest_notify) { Hum, as described in the comment above, I don't think we should look at ops->guest_notify value at all. Checking ops != NULL should be enough. > + struct rte_vhost_device_ops *new_ops; > + > + new_ops = malloc(sizeof(*new_ops)); Strictly speaking, we lose the numa affinity of "ops" by calling malloc. I am unclear of the impact though. > + if (new_ops == NULL) > + return -1; > + > + memcpy(new_ops, ops, sizeof(*new_ops)); > + new_ops->guest_notify = NULL; > + > + ret = rte_vhost_driver_callback_register__(path, ops, true); > + } else { > + ret = rte_vhost_driver_callback_register__(path, ops, false); > + } > + > + return ret; > +} > + > +/* Mark the v23 function as the old version, and v24 as the default version. > */ > +VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23); > +BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24); > +MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path, > + struct rte_vhost_device_ops const * const ops), > + rte_vhost_driver_callback_register_v24); > + > struct rte_vhost_device_ops const * > vhost_driver_callback_get(const char *path) > { > diff --git a/lib/vhost/version.map b/lib/vhost/version.map > index d322a4a888..7bcbfd12cf 100644 > --- a/lib/vhost/version.map > +++ b/lib/vhost/version.map > @@ -64,6 +64,12 @@ DPDK_23 { > local: *; > }; > > +DPDK_24 { > + global: > + > + rte_vhost_driver_callback_register; > +} DPDK_23; > + > EXPERIMENTAL { > global: > > @@ -98,6 +104,9 @@ EXPERIMENTAL { > # added in 22.11 > rte_vhost_async_dma_unconfigure; > rte_vhost_vring_call_nonblock; > + > + # added in 23.07 > + rte_vhost_notify_guest; > }; > > INTERNAL { > diff --git a/lib/vhost/vhost.c b/lib/vhost/vhost.c > index 8ff6434c93..79e88f986e 100644 > --- a/lib/vhost/vhost.c > +++ b/lib/vhost/vhost.c > @@ -44,6 +44,10 @@ static const struct vhost_vq_stats_name_off > vhost_vq_stat_strings[] = { > {"size_1024_1518_packets", offsetof(struct vhost_virtqueue, > stats.size_bins[6])}, > {"size_1519_max_packets", offsetof(struct vhost_virtqueue, > stats.size_bins[7])}, > {"guest_notifications", offsetof(struct vhost_virtqueue, > stats.guest_notifications)}, > + {"guest_notifications_offloaded", offsetof(struct vhost_virtqueue, > + stats.guest_notifications_offloaded)}, > + {"guest_notifications_error", offsetof(struct vhost_virtqueue, > + stats.guest_notifications_error)}, > {"iotlb_hits", offsetof(struct vhost_virtqueue, > stats.iotlb_hits)}, > {"iotlb_misses", offsetof(struct vhost_virtqueue, > stats.iotlb_misses)}, > {"inflight_submitted", offsetof(struct vhost_virtqueue, > stats.inflight_submitted)}, > @@ -1467,6 +1471,40 @@ rte_vhost_enable_guest_notification(int vid, uint16_t > queue_id, int enable) > return ret; > } > > +void > +rte_vhost_notify_guest(int vid, uint16_t queue_id) > +{ > + struct virtio_net *dev = get_device(vid); > + struct vhost_virtqueue *vq; > + > + if (!dev || queue_id >= VHOST_MAX_VRING) > + return; > + > + vq = dev->virtqueue[queue_id]; > + if (!vq) > + return; > + > + rte_rwlock_read_lock(&vq->access_lock); > + > + if (vq->callfd >= 0) { > + int ret = eventfd_write(vq->callfd, (eventfd_t)1); > + > + if (ret) { > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + > __atomic_fetch_add(&vq->stats.guest_notifications_error, > + 1, __ATOMIC_RELAXED); > + } else { > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + > __atomic_fetch_add(&vq->stats.guest_notifications, > + 1, __ATOMIC_RELAXED); > + if (dev->notify_ops->guest_notified) > + dev->notify_ops->guest_notified(dev->vid); > + } > + } > + > + rte_rwlock_read_unlock(&vq->access_lock); > +} > + > void > rte_vhost_log_write(int vid, uint64_t addr, uint64_t len) > { > diff --git a/lib/vhost/vhost.h b/lib/vhost/vhost.h > index 37609c7c8d..0ab75b78b5 100644 > --- a/lib/vhost/vhost.h > +++ b/lib/vhost/vhost.h > @@ -141,6 +141,8 @@ struct virtqueue_stats { > uint64_t inflight_completed; > /* Counters below are atomic, and should be incremented as such. */ > uint64_t guest_notifications; > + uint64_t guest_notifications_offloaded; > + uint64_t guest_notifications_error; > }; > > /** > @@ -884,6 +886,35 @@ vhost_need_event(uint16_t event_idx, uint16_t new_idx, > uint16_t old) > return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - > old); > } > > +static __rte_always_inline void > +vhost_vring_inject_irq(struct virtio_net *dev, struct vhost_virtqueue *vq) > +{ > + int ret; > + > + if (dev->notify_ops->guest_notify && > + dev->notify_ops->guest_notify(dev->vid, vq->index)) { > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + > __atomic_fetch_add(&vq->stats.guest_notifications_offloaded, > + 1, __ATOMIC_RELAXED); > + return; > + } > + > + ret = eventfd_write(vq->callfd, (eventfd_t) 1); > + if (ret) { > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + > __atomic_fetch_add(&vq->stats.guest_notifications_error, > + 1, __ATOMIC_RELAXED); > + return; > + } > + > + if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > + __atomic_fetch_add(&vq->stats.guest_notifications, > + 1, __ATOMIC_RELAXED); > + if (dev->notify_ops->guest_notified) > + dev->notify_ops->guest_notified(dev->vid); > +} > + > + One empty line is enough. > static __rte_always_inline void > vhost_vring_call_split(struct virtio_net *dev, struct vhost_virtqueue *vq) > { > @@ -903,26 +934,16 @@ vhost_vring_call_split(struct virtio_net *dev, struct > vhost_virtqueue *vq) > "%s: used_event_idx=%d, old=%d, new=%d\n", > __func__, vhost_used_event(vq), old, new); > > - if ((vhost_need_event(vhost_used_event(vq), new, old) && > - (vq->callfd >= 0)) || > - unlikely(!signalled_used_valid)) { > - eventfd_write(vq->callfd, (eventfd_t) 1); > - if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > - > __atomic_fetch_add(&vq->stats.guest_notifications, > - 1, __ATOMIC_RELAXED); > - if (dev->notify_ops->guest_notified) > - dev->notify_ops->guest_notified(dev->vid); > + if ((vhost_need_event(vhost_used_event(vq), new, old) || > + unlikely(!signalled_used_valid)) && A bit hard to read (even before your change). After the change, indentation does not comply with dpdk coding style. http://doc.dpdk.org/guides/contributing/coding_style.html#c-indentation But putting indentation aside, is this change equivalent? - if ((vhost_need_event(vhost_used_event(vq), new, old) && - (vq->callfd >= 0)) || - unlikely(!signalled_used_valid)) { + if ((vhost_need_event(vhost_used_event(vq), new, old) || + unlikely(!signalled_used_valid)) && + vq->callfd >= 0) { > + vhost_vring_inject_irq(dev, vq); > } > } else { > /* Kick the guest if necessary. */ > if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT) > && (vq->callfd >= 0)) { > - eventfd_write(vq->callfd, (eventfd_t)1); > - if (dev->flags & VIRTIO_DEV_STATS_ENABLED) > - > __atomic_fetch_add(&vq->stats.guest_notifications, > - 1, __ATOMIC_RELAXED); > - if (dev->notify_ops->guest_notified) > - dev->notify_ops->guest_notified(dev->vid); > + vhost_vring_inject_irq(dev, vq); > } > } > } > @@ -974,11 +995,8 @@ vhost_vring_call_packed(struct virtio_net *dev, struct > vhost_virtqueue *vq) > if (vhost_need_event(off, new, old)) > kick = true; > kick: > - if (kick) { > - eventfd_write(vq->callfd, (eventfd_t)1); > - if (dev->notify_ops->guest_notified) > - dev->notify_ops->guest_notified(dev->vid); > - } > + if (kick && vq->callfd >= 0) > + vhost_vring_inject_irq(dev, vq); > } > > static __rte_always_inline void > @@ -1017,4 +1035,11 @@ mbuf_is_consumed(struct rte_mbuf *m) > > uint64_t hua_to_alignment(struct rte_vhost_memory *mem, void *ptr); > void mem_set_dump(void *ptr, size_t size, bool enable, uint64_t alignment); > + > +/* Versioned functions */ > +int rte_vhost_driver_callback_register_v23(const char *path, > + struct rte_vhost_device_ops const * const ops); > +int rte_vhost_driver_callback_register_v24(const char *path, > + struct rte_vhost_device_ops const * const ops); > + > #endif /* _VHOST_NET_CDEV_H_ */ > -- David Marchand