Stefano Garzarella <sgarz...@redhat.com> 于2024年10月10日周四 16:20写道:
>
> On Thu, Oct 10, 2024 at 02:18:24PM GMT, yaozhenguo wrote:
> >During the hot-unplugging of vhost-user-net type network cards,
> >the vhost_user_cleanup function may add the same rcu node to
> >the rcu linked list.
> >The function call relationship in this case is as follows:
> >
> >vhost_user_cleanup
> > ->vhost_user_host_notifier_remove
> > ->call_rcu(n, vhost_user_host_notifier_free, rcu);
> > ->g_free_rcu(n, rcu);
> >
> >When this happens, QEMU will abort in try_dequeue:
> >
> >if (head == &dummy && qatomic_mb_read(&tail) == &dummy.next) {
> > abort();
> >}
> >
> >The backtrace is as follows:
> >0 __pthread_kill_implementation () at /usr/lib64/libc.so.6
> >1 raise () at /usr/lib64/libc.so.6
> >2 abort () at /usr/lib64/libc.so.6
> >3 try_dequeue () at ../util/rcu.c:235
> >4 call_rcu_thread (0) at ../util/rcu.c:288
> >5 qemu_thread_start (0) at ../util/qemu-thread-posix.c:541
> >6 start_thread () at /usr/lib64/libc.so.6
> >7 clone3 () at /usr/lib64/libc.so.6
> >
> >The reason for the abort is that adding the same node to the rcu
> >linked list twice turns the list into a ring. When the dummy node
> >is later added after the duplicated node, the ring is opened again,
> >but the list then contains only one node while rcu_call_count has
> >been incremented twice. This mismatch causes try_dequeue to abort.
> >
> >This happens when n->addr != 0, which does occur in some scenarios.
> >For example, this situation will occur when using a 32-queue DPU
> >vhost-user-net type network card for hot-unplug testing, because
> >VhostUserHostNotifier->addr will be cleared during the processing of
> >VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG. However, it is asynchronous,
> >so we cannot guarantee that VhostUserHostNotifier->addr is zero in
> >vhost_user_cleanup. Therefore, it is necessary to merge g_free_rcu
> >and vhost_user_host_notifier_free into one rcu node.
> >
> >Fixes: 503e355465 ("virtio/vhost-user: dynamically assign
> >VhostUserHostNotifiers")
> >Signed-off-by: yaozhenguo <yaozhen...@jd.com>
> >---
> > V1->V2: add n->addr check in vhost_user_get_vring_base and
> > vhost_user_backend_handle_vring_host_notifier to prevent submitting the
> > same node to the rcu list.
> > V2->V3: 1. change "free" to "destroy"
> > 2. move "!n->addr && !destroy" checking to
> > vhost_user_host_notifier_remove
> > 3. move "!n" checking to vhost_user_host_notifier_remove
> >---
> > hw/virtio/vhost-user.c | 43 ++++++++++++++++++----------------
> > include/hw/virtio/vhost-user.h | 1 +
> > 2 files changed, 24 insertions(+), 20 deletions(-)
>
> Some checkpatch errors:
>
> $ ./scripts/checkpatch.pl --strict --branch master..HEAD --codespell
> ERROR: braces {} are necessary for all arms of this statement
> #98: FILE: hw/virtio/vhost-user.c:1207:
> + if (!n)
> [...]
>
> ERROR: braces {} are necessary for all arms of this statement
> #100: FILE: hw/virtio/vhost-user.c:1209:
> + if (!destroy && !n->addr)
> [...]
>
> total: 2 errors, 0 warnings, 96 lines checked
>
I will fix it later.
> >
> >diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
> >index 00561daa06..f80d0af76f 100644
> >--- a/hw/virtio/vhost-user.c
> >+++ b/hw/virtio/vhost-user.c
> >@@ -1185,9 +1185,16 @@ static int vhost_user_set_vring_num(struct vhost_dev
> >*dev,
> >
> > static void vhost_user_host_notifier_free(VhostUserHostNotifier *n)
> > {
> >- assert(n && n->unmap_addr);
> >- munmap(n->unmap_addr, qemu_real_host_page_size());
> >- n->unmap_addr = NULL;
> >+ if (n->unmap_addr) {
> >+ munmap(n->unmap_addr, qemu_real_host_page_size());
> >+ n->unmap_addr = NULL;
> >+ }
> >+ if (n->destroy) {
> >+ memory_region_transaction_begin();
> >+ object_unparent(OBJECT(&n->mr));
> >+ memory_region_transaction_commit();
> >+ g_free(n);
> >+ }
> > }
> >
> > /*
> >@@ -1195,17 +1202,25 @@ static void
> >vhost_user_host_notifier_free(VhostUserHostNotifier *n)
> > * under rcu.
> > */
> > static void vhost_user_host_notifier_remove(VhostUserHostNotifier *n,
> >- VirtIODevice *vdev)
> >+ VirtIODevice *vdev, bool
> >destroy)
> > {
> >+ if (!n)
> >+ return;
> >+ if (!destroy && !n->addr)
>
> IIUC if `destroy` is false and `n->addr` is NULL, we don't have anything
> to do, so we can early return, right?
>
Yes.
> Maybe we can put a comment on that condition.
>
> >+ return;
>
> Better to use a single if here in this way:
> if (!n || (!destroy && !n->addr)) {
> return;
> }
>
Got it!
> The rest LGTM!
>
> Thanks,
> Stefano
>
> >+
> > if (n->addr) {
> > if (vdev) {
> >+ memory_region_transaction_begin();
> > virtio_queue_set_host_notifier_mr(vdev, n->idx, &n->mr, false);
> >+ memory_region_transaction_commit();
> > }
> > assert(!n->unmap_addr);
> > n->unmap_addr = n->addr;
> > n->addr = NULL;
> >- call_rcu(n, vhost_user_host_notifier_free, rcu);
> > }
> >+ n->destroy = destroy;
> >+ call_rcu(n, vhost_user_host_notifier_free, rcu);
> > }
> >
> > static int vhost_user_set_vring_base(struct vhost_dev *dev,
> >@@ -1279,9 +1294,7 @@ static int vhost_user_get_vring_base(struct vhost_dev
> >*dev,
> > struct vhost_user *u = dev->opaque;
> >
> > VhostUserHostNotifier *n = fetch_notifier(u->user, ring->index);
> >- if (n) {
> >- vhost_user_host_notifier_remove(n, dev->vdev);
> >- }
> >+ vhost_user_host_notifier_remove(n, dev->vdev, false);
> >
> > ret = vhost_user_write(dev, &msg, NULL, 0);
> > if (ret < 0) {
> >@@ -1562,7 +1575,7 @@ static int
> >vhost_user_backend_handle_vring_host_notifier(struct vhost_dev *dev,
> > * new mapped address.
> > */
> > n = fetch_or_create_notifier(user, queue_idx);
> >- vhost_user_host_notifier_remove(n, vdev);
> >+ vhost_user_host_notifier_remove(n, vdev, false);
> >
> > if (area->u64 & VHOST_USER_VRING_NOFD_MASK) {
> > return 0;
> >@@ -2736,15 +2749,7 @@ static int vhost_user_set_inflight_fd(struct
> >vhost_dev *dev,
> > static void vhost_user_state_destroy(gpointer data)
> > {
> > VhostUserHostNotifier *n = (VhostUserHostNotifier *) data;
> >- if (n) {
> >- vhost_user_host_notifier_remove(n, NULL);
> >- object_unparent(OBJECT(&n->mr));
> >- /*
> >- * We can't free until vhost_user_host_notifier_remove has
> >- * done it's thing so schedule the free with RCU.
> >- */
> >- g_free_rcu(n, rcu);
> >- }
> >+ vhost_user_host_notifier_remove(n, NULL, true);
> > }
> >
> > bool vhost_user_init(VhostUserState *user, CharBackend *chr, Error **errp)
> >@@ -2765,9 +2770,7 @@ void vhost_user_cleanup(VhostUserState *user)
> > if (!user->chr) {
> > return;
> > }
> >- memory_region_transaction_begin();
> > user->notifiers = (GPtrArray *) g_ptr_array_free(user->notifiers, true);
> >- memory_region_transaction_commit();
> > user->chr = NULL;
> > }
> >
> >diff --git a/include/hw/virtio/vhost-user.h b/include/hw/virtio/vhost-user.h
> >index 324cd8663a..9a3f238b43 100644
> >--- a/include/hw/virtio/vhost-user.h
> >+++ b/include/hw/virtio/vhost-user.h
> >@@ -54,6 +54,7 @@ typedef struct VhostUserHostNotifier {
> > void *addr;
> > void *unmap_addr;
> > int idx;
> >+ bool destroy;
> > } VhostUserHostNotifier;
> >
> > /**
> >--
> >2.41.0
> >
>