On Fri, Mar 15, 2024 at 03:21:27PM +0100, Cédric Le Goater wrote:
> On 3/15/24 13:20, Cédric Le Goater wrote:
> > On 3/15/24 12:01, Peter Xu wrote:
> > > On Fri, Mar 15, 2024 at 11:17:45AM +0100, Cédric Le Goater wrote:
> > > > > migrate_set_state is also unintuitive because it ignores invalid state
> > > > > transitions and we've been using that property to deal with special
> > > > > states such as POSTCOPY_PAUSED and FAILED:
> > > > >
> > > > > - After the migration goes into POSTCOPY_PAUSED, the resumed migration's
> > > > >   migrate_init() will try to set the state NONE->SETUP, which is not
> > > > >   valid.
> > > > >
> > > > > - After save_setup fails, the migration goes into FAILED, but wait_unplug
> > > > >   will try to transition SETUP->ACTIVE, which is also not valid.
> > > > >
> > > >
> > > > I am not sure I understand what the plan is. Both solutions are problematic
> > > > regarding the state transitions.
> > > >
> > > > Should we consider that waiting for failover devices to unplug is an internal
> > > > step of the SETUP phase, not transitioning to ACTIVE ?
> > >
> > > If to unblock this series, IIUC the simplest solution is to do what Fabiano
> > > suggested, that we move qemu_savevm_wait_unplug() to be before the check of
> > > setup() ret.
> >
> > The simplest is IMHO moving qemu_savevm_wait_unplug() before
> > qemu_savevm_state_setup() and leaving patch 10 unchanged. See
> > below the extra patch. It looks much cleaner than what we have
> > today.
> >
> > > In that case, the state change in qemu_savevm_wait_unplug()
> > > should be benign and we should see a super small window it became ACTIVE
> > > but then it should be FAILED (and IIUC the patch itself will need to use
> > > ACTIVE as "old_state", not SETUP anymore).
> >
> > OK. I will give it a try to compare.
>
> Here's the alternative solution. SETUP state failures are handled after
> transitioning to ACTIVE state, which is unfortunate but probably harmless.
> I guess it's OK.
This also looks good to me, thanks.

One trivial early comment: in this case we can introduce a helper to cover
both setup() calls and the UNPLUG waits, and dedup the two paths (a rough
sketch is appended below the patch).

>
> Thanks,
>
> C.
>
>
> Signed-off-by: Cédric Le Goater <c...@redhat.com>
> ---
>  migration/savevm.h    |  2 +-
>  migration/migration.c | 29 +++++++++++++++++++++++++++--
>  migration/savevm.c    | 26 +++++++++++++++-----------
>  3 files changed, 43 insertions(+), 14 deletions(-)
>
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 74669733dd63a080b765866c703234a5c4939223..9ec96a995c93a42aad621595f0ed58596c532328 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -32,7 +32,7 @@
>  bool qemu_savevm_state_blocked(Error **errp);
>  void qemu_savevm_non_migratable_list(strList **reasons);
>  int qemu_savevm_state_prepare(Error **errp);
> -void qemu_savevm_state_setup(QEMUFile *f);
> +int qemu_savevm_state_setup(QEMUFile *f, Error **errp);
>  bool qemu_savevm_state_guest_unplug_pending(void);
>  int qemu_savevm_state_resume_prepare(MigrationState *s);
>  void qemu_savevm_state_header(QEMUFile *f);
> diff --git a/migration/migration.c b/migration/migration.c
> index 644e073b7dcc70cb2bdaa9c975ba478952465ff4..0704ad6226df61f2f15bd81a2897f9946d601ca7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3427,6 +3427,8 @@ static void *migration_thread(void *opaque)
>      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      MigThrError thr_error;
>      bool urgent = false;
> +    Error *local_err = NULL;
> +    int ret;
>
>      thread = migration_threads_add("live_migration", qemu_get_thread_id());
>
> @@ -3470,12 +3472,24 @@ static void *migration_thread(void *opaque)
>      }
>
>      bql_lock();
> -    qemu_savevm_state_setup(s->to_dst_file);
> +    ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
>      bql_unlock();
>
>      qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
>                              MIGRATION_STATUS_ACTIVE);
>
> +    /*
> +     * Handle SETUP failures after waiting for virtio-net-failover
> +     * devices to unplug. This to preserve migration state transitions.
> +     */
> +    if (ret) {
> +        migrate_set_error(s, local_err);
> +        error_free(local_err);
> +        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
> +                          MIGRATION_STATUS_FAILED);
> +        goto out;
> +    }
> +
>      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
>
>      trace_migration_thread_setup_complete();
>
> @@ -3549,6 +3563,8 @@ static void *bg_migration_thread(void *opaque)
>      MigThrError thr_error;
>      QEMUFile *fb;
>      bool early_fail = true;
> +    Error *local_err = NULL;
> +    int ret;
>
>      rcu_register_thread();
>      object_ref(OBJECT(s));
>
> @@ -3582,12 +3598,20 @@ static void *bg_migration_thread(void *opaque)
>
>      bql_lock();
>      qemu_savevm_state_header(s->to_dst_file);
> -    qemu_savevm_state_setup(s->to_dst_file);
> +    ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
>      bql_unlock();
>
>      qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
>                              MIGRATION_STATUS_ACTIVE);
>
> +    if (ret) {
> +        migrate_set_error(s, local_err);
> +        error_free(local_err);
> +        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
> +                          MIGRATION_STATUS_FAILED);
> +        goto fail_setup;
> +    }
> +
>      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
>
>      trace_migration_thread_setup_complete();
>
> @@ -3656,6 +3680,7 @@ fail:
>          bql_unlock();
>      }
>
> +fail_setup:
>      bg_migration_iteration_finish(s);
>
>      qemu_fclose(fb);
>
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 1a7b5cb78a912c36ae16db703afc90ef2906b61f..0eb94e61f888adba2c0732c2cb701b110814c455 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1310,11 +1310,11 @@ int qemu_savevm_state_prepare(Error **errp)
>      return 0;
>  }
>
> -void qemu_savevm_state_setup(QEMUFile *f)
> +int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
>  {
> +    ERRP_GUARD();
>      MigrationState *ms = migrate_get_current();
>      SaveStateEntry *se;
> -    Error *local_err = NULL;
>      int ret = 0;
>
>      json_writer_int64(ms->vmdesc, "page_size", qemu_target_page_size());
> @@ -1323,10 +1323,9 @@ void qemu_savevm_state_setup(QEMUFile *f)
>      trace_savevm_state_setup();
>      QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
>          if (se->vmsd && se->vmsd->early_setup) {
> -            ret = vmstate_save(f, se, ms->vmdesc, &local_err);
> +            ret = vmstate_save(f, se, ms->vmdesc, errp);
>              if (ret) {
> -                migrate_set_error(ms, local_err);
> -                error_report_err(local_err);
> +                migrate_set_error(ms, *errp);
>                  qemu_file_set_error(f, ret);
>                  break;
>              }
> @@ -1346,18 +1345,19 @@ void qemu_savevm_state_setup(QEMUFile *f)
>          ret = se->ops->save_setup(f, se->opaque);
>          save_section_footer(f, se);
>          if (ret < 0) {
> +            error_setg(errp, "failed to setup SaveStateEntry with id(name): "
> +                       "%d(%s): %d", se->section_id, se->idstr, ret);
>              qemu_file_set_error(f, ret);
>              break;
>          }
>      }
>
>      if (ret) {
> -        return;
> +        return ret;
>      }
>
> -    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
> -        error_report_err(local_err);
> -    }
> +    /* TODO: Should we check that errp is set in case of failure ? */
> +    return precopy_notify(PRECOPY_NOTIFY_SETUP, errp);
>  }
>
>  int qemu_savevm_state_resume_prepare(MigrationState *s)
> @@ -1725,7 +1725,10 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
>      ms->to_dst_file = f;
>
>      qemu_savevm_state_header(f);
> -    qemu_savevm_state_setup(f);
> +    ret = qemu_savevm_state_setup(f, errp);
> +    if (ret) {
> +        goto cleanup;
> +    }
>
>      while (qemu_file_get_error(f) == 0) {
>          if (qemu_savevm_state_iterate(f, false) > 0) {
> @@ -1738,10 +1741,11 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
>          qemu_savevm_state_complete_precopy(f, false, false);
>          ret = qemu_file_get_error(f);
>      }
> -    qemu_savevm_state_cleanup();
>      if (ret != 0) {
>          error_setg_errno(errp, -ret, "Error while writing VM state");
>      }
> +cleanup:
> +    qemu_savevm_state_cleanup();
>
>      if (ret != 0) {
>          status = MIGRATION_STATUS_FAILED;
> --
> 2.44.0
>
>

--
Peter Xu
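
To illustrate the helper suggestion above: something along these lines is
what I had in mind. This is an untested sketch only, and the helper name
migration_save_setup() is made up here; it is not part of Cédric's patch.

    /*
     * Hypothetical helper: run qemu_savevm_state_setup() under the BQL,
     * wait for failover devices to unplug, then turn a setup failure into
     * MIGRATION_STATUS_FAILED.  Both migration_thread() and
     * bg_migration_thread() could call this instead of open-coding the
     * same sequence twice (bg_migration_thread() would still emit the
     * state header itself before calling it).
     */
    static int migration_save_setup(MigrationState *s)
    {
        Error *local_err = NULL;
        int ret;

        bql_lock();
        ret = qemu_savevm_state_setup(s->to_dst_file, &local_err);
        bql_unlock();

        /* Keep the SETUP -> ACTIVE transition before handling failures */
        qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
                                MIGRATION_STATUS_ACTIVE);

        if (ret) {
            /* Record the setup error and fail the migration */
            migrate_set_error(s, local_err);
            error_free(local_err);
            migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
                              MIGRATION_STATUS_FAILED);
        }
        return ret;
    }

The callers would then only check the return value and jump to their
respective cleanup labels ("out" / "fail_setup").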