I'll let others chime in on higher level device model changes you're making. For the vhost-user-blk and vhost-user-scsi bits:
On Sun, Mar 9, 2025 at 11:41 AM Haoqian He <haoqian...@smartx.com> wrote: > > Live migration should be terminated if backend crash. nit: "...if the backend crashes before the migration completes." > > Since the vhost device will be stopped when VM is stopped before > the end of the live migration, current implementation if vhost > backend died, vhost device's set_status() will not return failure, > live migration won't perceive the disconnection between qemu and > vhost backend, inflight io would be submitted in migration target > host, leading to IO error. > > To fix this issue: > 1. Add set_status_ext() which has return value for > VirtioDeviceClass and vhost-user-blk/scsi use the _ext version. > > 2. In set_status_ext(), return failure if the flag `connected` > is false or vhost_dev_stop return failure, which means qemu lost > connection with backend. I have some concerns with [2]. See my later responses. > > Hence migration_completion() will process failure, terminate > migration and restore VM. > > Signed-off-by: Haoqian He <haoqian...@smartx.com> > --- > hw/block/vhost-user-blk.c | 29 +++++++++++++++------------ > hw/scsi/vhost-scsi-common.c | 11 +++++----- > hw/scsi/vhost-user-scsi.c | 20 ++++++++++-------- > hw/virtio/virtio.c | 20 +++++++++++++----- > include/hw/virtio/vhost-scsi-common.h | 2 +- > include/hw/virtio/virtio.h | 1 + > 6 files changed, 51 insertions(+), 32 deletions(-) > > diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c > index ae42327cf8..4865786c54 100644 > --- a/hw/block/vhost-user-blk.c > +++ b/hw/block/vhost-user-blk.c > @@ -204,7 +204,7 @@ err_host_notifiers: > return ret; > } > > -static void vhost_user_blk_stop(VirtIODevice *vdev) > +static int vhost_user_blk_stop(VirtIODevice *vdev) > { > VHostUserBlk *s = VHOST_USER_BLK(vdev); > BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > @@ -212,26 +212,26 @@ static void vhost_user_blk_stop(VirtIODevice *vdev) > int ret; > > if (!s->started_vu) { > - return; > + return 0; > } > s->started_vu = false; > > if (!k->set_guest_notifiers) { > - return; > + return 0; > } > > - vhost_dev_stop(&s->dev, vdev, true); > + ret = vhost_dev_stop(&s->dev, vdev, true); > > - ret = k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false); > - if (ret < 0) { > + if (k->set_guest_notifiers(qbus->parent, s->dev.nvqs, false) < 0) { > error_report("vhost guest notifier cleanup failed: %d", ret); > - return; > + return -1; > } > > vhost_dev_disable_notifiers(&s->dev, vdev); > + return ret; > } > > -static void vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) > +static int vhost_user_blk_set_status(VirtIODevice *vdev, uint8_t status) > { > VHostUserBlk *s = VHOST_USER_BLK(vdev); > bool should_start = virtio_device_should_start(vdev, status); > @@ -239,11 +239,11 @@ static void vhost_user_blk_set_status(VirtIODevice > *vdev, uint8_t status) > int ret; > Do we want to fail out in all cases where vhost_user_blk_set_status() and the device is not connected? Could this impact power-on if the backend is temporarily down? > if (!s->connected) { > - return; > + return -1; > } > > if (vhost_dev_is_started(&s->dev) == should_start) { > - return; > + return 0; > } > > if (should_start) { > @@ -253,9 +253,12 @@ static void vhost_user_blk_set_status(VirtIODevice > *vdev, uint8_t status) > qemu_chr_fe_disconnect(&s->chardev); > } > } else { > - vhost_user_blk_stop(vdev); > + ret = vhost_user_blk_stop(vdev); > + if (ret < 0) { > + return ret; > + } > } > - > + return 0; > } > > static uint64_t vhost_user_blk_get_features(VirtIODevice *vdev, > @@ -597,7 +600,7 @@ static void vhost_user_blk_class_init(ObjectClass *klass, > void *data) > vdc->get_config = vhost_user_blk_update_config; > vdc->set_config = vhost_user_blk_set_config; > vdc->get_features = vhost_user_blk_get_features; > - vdc->set_status = vhost_user_blk_set_status; > + vdc->set_status_ext = vhost_user_blk_set_status; > vdc->reset = vhost_user_blk_reset; > vdc->get_vhost = vhost_user_blk_get_vhost; > } > diff --git a/hw/scsi/vhost-scsi-common.c b/hw/scsi/vhost-scsi-common.c > index 4c8637045d..bfeed0cb1b 100644 > --- a/hw/scsi/vhost-scsi-common.c > +++ b/hw/scsi/vhost-scsi-common.c > @@ -101,24 +101,25 @@ err_host_notifiers: > return ret; > } > > -void vhost_scsi_common_stop(VHostSCSICommon *vsc) > +int vhost_scsi_common_stop(VHostSCSICommon *vsc) > { > VirtIODevice *vdev = VIRTIO_DEVICE(vsc); > BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev))); > VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); > int ret = 0; > > - vhost_dev_stop(&vsc->dev, vdev, true); > + ret = vhost_dev_stop(&vsc->dev, vdev, true); > > if (k->set_guest_notifiers) { > - ret = k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false); > - if (ret < 0) { > + int r = k->set_guest_notifiers(qbus->parent, vsc->dev.nvqs, false); > + if (r < 0) { > error_report("vhost guest notifier cleanup failed: %d", ret); > } Now that we return back a value the assert here should probably be dropped? > + assert(r >= 0); > } > - assert(ret >= 0); > > vhost_dev_disable_notifiers(&vsc->dev, vdev); > + return ret; > } > > uint64_t vhost_scsi_common_get_features(VirtIODevice *vdev, uint64_t > features, > diff --git a/hw/scsi/vhost-user-scsi.c b/hw/scsi/vhost-user-scsi.c > index adb41b9816..8e7efc38f2 100644 > --- a/hw/scsi/vhost-user-scsi.c > +++ b/hw/scsi/vhost-user-scsi.c > @@ -52,19 +52,19 @@ static int vhost_user_scsi_start(VHostUserSCSI *s, Error > **errp) > return ret; > } > > -static void vhost_user_scsi_stop(VHostUserSCSI *s) > +static int vhost_user_scsi_stop(VHostUserSCSI *s) > { > VHostSCSICommon *vsc = VHOST_SCSI_COMMON(s); > > if (!s->started_vu) { > - return; > + return 0; > } > s->started_vu = false; > > - vhost_scsi_common_stop(vsc); > + return vhost_scsi_common_stop(vsc); > } > > -static void vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) > +static int vhost_user_scsi_set_status(VirtIODevice *vdev, uint8_t status) > { > VHostUserSCSI *s = (VHostUserSCSI *)vdev; > DeviceState *dev = DEVICE(vdev); > @@ -75,11 +75,11 @@ static void vhost_user_scsi_set_status(VirtIODevice > *vdev, uint8_t status) > int ret; > > if (!s->connected) { > - return; > + return -1; > } > > if (vhost_dev_is_started(&vsc->dev) == should_start) { > - return; > + return 0; > } > > if (should_start) { > @@ -91,8 +91,12 @@ static void vhost_user_scsi_set_status(VirtIODevice *vdev, > uint8_t status) > qemu_chr_fe_disconnect(&vs->conf.chardev); > } > } else { > - vhost_user_scsi_stop(s); > + ret = vhost_user_scsi_stop(s); > + if (ret) { > + return ret; > + } > } > + return 0; > } > > static void vhost_user_scsi_handle_output(VirtIODevice *vdev, VirtQueue *vq) > @@ -399,7 +403,7 @@ static void vhost_user_scsi_class_init(ObjectClass > *klass, void *data) > vdc->unrealize = vhost_user_scsi_unrealize; > vdc->get_features = vhost_scsi_common_get_features; > vdc->set_config = vhost_scsi_common_set_config; > - vdc->set_status = vhost_user_scsi_set_status; > + vdc->set_status_ext = vhost_user_scsi_set_status; > fwc->get_dev_path = vhost_scsi_common_get_fw_dev_path; > vdc->reset = vhost_user_scsi_reset; > vdc->get_vhost = vhost_user_scsi_get_vhost; > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c > index 5e8d4cab53..fff7cdb35d 100644 > --- a/hw/virtio/virtio.c > +++ b/hw/virtio/virtio.c > @@ -2221,12 +2221,12 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) > { > VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); > trace_virtio_set_status(vdev, val); > + int ret = 0; > > if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { > if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) && > val & VIRTIO_CONFIG_S_FEATURES_OK) { > - int ret = virtio_validate_features(vdev); > - > + ret = virtio_validate_features(vdev); > if (ret) { > return ret; > } > @@ -2238,12 +2238,18 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) > virtio_set_started(vdev, val & VIRTIO_CONFIG_S_DRIVER_OK); > } > > - if (k->set_status) { > + if (k->set_status_ext) { > + ret = k->set_status_ext(vdev, val); > + if (ret) { > + qemu_log("set %s status to %d failed, old status: %d\n", > + vdev->name, val, vdev->status); > + } > + } else if (k->set_status) { > k->set_status(vdev, val); > } > vdev->status = val; > > - return 0; > + return ret; > } > > static enum virtio_device_endian virtio_default_endian(void) > @@ -3436,7 +3442,11 @@ static int virtio_vmstate_change(void *opaque, bool > running, RunState state) > } > > if (!backend_run) { > - virtio_set_status(vdev, vdev->status); > + // the return value was used for stopping VM during migration > + int ret = virtio_set_status(vdev, vdev->status); > + if (ret) { > + return ret; > + } > } > return 0; > } > diff --git a/include/hw/virtio/vhost-scsi-common.h > b/include/hw/virtio/vhost-scsi-common.h > index c5d2c09455..d54d9c916f 100644 > --- a/include/hw/virtio/vhost-scsi-common.h > +++ b/include/hw/virtio/vhost-scsi-common.h > @@ -40,7 +40,7 @@ struct VHostSCSICommon { > }; > > int vhost_scsi_common_start(VHostSCSICommon *vsc, Error **errp); > -void vhost_scsi_common_stop(VHostSCSICommon *vsc); > +int vhost_scsi_common_stop(VHostSCSICommon *vsc); > char *vhost_scsi_common_get_fw_dev_path(FWPathProvider *p, BusState *bus, > DeviceState *dev); > void vhost_scsi_common_set_config(VirtIODevice *vdev, const uint8_t *config); > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h > index 6386910280..c99d56f519 100644 > --- a/include/hw/virtio/virtio.h > +++ b/include/hw/virtio/virtio.h > @@ -187,6 +187,7 @@ struct VirtioDeviceClass { > void (*set_config)(VirtIODevice *vdev, const uint8_t *config); > void (*reset)(VirtIODevice *vdev); > void (*set_status)(VirtIODevice *vdev, uint8_t val); > + int (*set_status_ext)(VirtIODevice *vdev, uint8_t val); > /* Device must validate queue_index. */ > void (*queue_reset)(VirtIODevice *vdev, uint32_t queue_index); > /* Device must validate queue_index. */ > -- > 2.48.1 >