Shivam Kumar <shivam.kum...@nutanix.com> writes: > Even if a live migration fails due to some reason, migration status > should not be set to MIGRATION_STATUS_FAILED until migrate fd cleanup > is done, else the client can trigger another instance of migration > before the cleanup is complete (as it would assume no migration is > active) or reset migration capabilities affecting old migration's > cleanup. Hence, set the status to 'failing' when a migration failure > happens and once the cleanup is complete, set the migration status to > MIGRATION_STATUS_FAILED. > > Signed-off-by: Shivam Kumar <shivam.kum...@nutanix.com> > --- > migration/migration.c | 49 +++++++++++++++++++++---------------------- > migration/migration.h | 9 ++++++++ > migration/multifd.c | 6 ++---- > migration/savevm.c | 7 +++---- > 4 files changed, 38 insertions(+), 33 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index df61ca4e93..f084f54f6b 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -1143,8 +1143,9 @@ static bool migration_is_active(void)
migration_is_running() is the one that gates qmp_migrate() and qmp_migrate_set_capabilities(). > { > MigrationState *s = current_migration; > > - return (s->state == MIGRATION_STATUS_ACTIVE || > - s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); > + return ((s->state == MIGRATION_STATUS_ACTIVE || > + s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) && > + !qatomic_read(&s->failing)); > } > > static bool migrate_show_downtime(MigrationState *s) > @@ -1439,6 +1440,11 @@ static void migrate_fd_cleanup(MigrationState *s) > MIGRATION_STATUS_CANCELLED); > } > > + if (qatomic_xchg(&s->failing, 0)) { > + migrate_set_state(&s->state, s->state, > + MIGRATION_STATUS_FAILED); > + } I hope you've verified that sure every place that used to set FAILED will also reach migrate_fd_cleanup() eventually. Also, we probably still need the FAILING state. Otherwise, this will trip code that expects a state change on failure. Anything that does: if (state != MIGRATION_STATUS_FOO) { ... } So I think the change overall should be -migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); +migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING); void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, MigrationStatus new_state) { assert(new_state < MIGRATION_STATUS__MAX); if (qatomic_cmpxchg(state, old_state, new_state) == old_state) { trace_migrate_set_state(MigrationStatus_str(new_state)); + if (new_state == MIGRATION_STATUS_FAILING) { + qatomic_set(&s->failing, 1); + } migrate_generate_event(new_state); } } And we should proably do the same for CANCELLING actually, but there the (preexisting) issue is actual concurrency, while here it's just inconsistency in the state. > + > if (s->error) { > /* It is used on info migrate. We can't free it */ > error_report_err(error_copy(s->error)); > @@ -1484,17 +1490,16 @@ static void migrate_error_free(MigrationState *s) > static void migrate_fd_error(MigrationState *s, const Error *error) > { > MigrationStatus current = s->state; > - MigrationStatus next; > > assert(s->to_dst_file == NULL); > > switch (current) { > case MIGRATION_STATUS_SETUP: > - next = MIGRATION_STATUS_FAILED; > + qatomic_set(&s->failing, 1); > break; > case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: > /* Never fail a postcopy migration; switch back to PAUSED instead */ > - next = MIGRATION_STATUS_POSTCOPY_PAUSED; > + migrate_set_state(&s->state, current, > MIGRATION_STATUS_POSTCOPY_PAUSED); > break; > default: > /* > @@ -1506,7 +1511,6 @@ static void migrate_fd_error(MigrationState *s, const > Error *error) > return; > } > > - migrate_set_state(&s->state, current, next); > migrate_set_error(s, error); > } > > @@ -2101,8 +2105,7 @@ void qmp_migrate(const char *uri, bool has_channels, > } else { > error_setg(&local_err, QERR_INVALID_PARAMETER_VALUE, "uri", > "a valid migration protocol"); > - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > if (local_err) { > @@ -2498,7 +2501,7 @@ static int postcopy_start(MigrationState *ms, Error > **errp) > if (migrate_postcopy_preempt()) { > migration_wait_main_channel(ms); > if (postcopy_preempt_establish_channel(ms)) { > - migrate_set_state(&ms->state, ms->state, > MIGRATION_STATUS_FAILED); > + qatomic_set(&ms->failing, 1); > error_setg(errp, "%s: Failed to establish preempt channel", > __func__); > return -1; > @@ -2648,8 +2651,7 @@ static int postcopy_start(MigrationState *ms, Error > **errp) > fail_closefb: > qemu_fclose(fb); > fail: > - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&ms->failing, 1); > if (restart_block) { > /* A failure happened early enough that we know the destination > hasn't > * accessed block devices, so we're safe to recover. > @@ -2782,8 +2784,7 @@ static void migration_completion_failed(MigrationState > *s, > bql_unlock(); > } > > - migrate_set_state(&s->state, current_active_state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > /** > @@ -2850,8 +2851,6 @@ fail: > */ > static void bg_migration_completion(MigrationState *s) > { > - int current_active_state = s->state; > - > if (s->state == MIGRATION_STATUS_ACTIVE) { > /* > * By this moment we have RAM content saved into the migration > stream. > @@ -2874,8 +2873,7 @@ static void bg_migration_completion(MigrationState *s) > return; > > fail: > - migrate_set_state(&s->state, current_active_state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > > typedef enum MigThrError { > @@ -3047,6 +3045,10 @@ static MigThrError > migration_detect_error(MigrationState *s) > return MIG_THR_ERR_FATAL; > } > > + if (qatomic_read(&s->failing)) { > + return MIG_THR_ERR_FATAL; > + } > + > /* > * Try to detect any file errors. Note that postcopy_qemufile_src will > * be NULL when postcopy preempt is not enabled. > @@ -3077,7 +3079,7 @@ static MigThrError > migration_detect_error(MigrationState *s) > * For precopy (or postcopy with error outside IO), we fail > * with no time. > */ > - migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > trace_migration_thread_file_err(); > > /* Time to stop the migration, now. */ > @@ -3492,8 +3494,7 @@ static void *migration_thread(void *opaque) > if (ret) { > migrate_set_error(s, local_err); > error_free(local_err); > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > goto out; > } > > @@ -3617,8 +3618,7 @@ static void *bg_migration_thread(void *opaque) > if (ret) { > migrate_set_error(s, local_err); > error_free(local_err); > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > goto fail_setup; > } > > @@ -3685,8 +3685,7 @@ static void *bg_migration_thread(void *opaque) > > fail: > if (early_fail) { > - migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > bql_unlock(); > } > > @@ -3805,7 +3804,7 @@ void migrate_fd_connect(MigrationState *s, Error > *error_in) > > fail: > migrate_set_error(s, local_err); > - migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > error_report_err(local_err); > migrate_fd_cleanup(s); > } > diff --git a/migration/migration.h b/migration/migration.h > index 7b6e718690..3b808d971f 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -471,6 +471,15 @@ struct MigrationState { > bool switchover_acked; > /* Is this a rdma migration */ > bool rdma_migration; > + /* > + * Is migration failing? Migration status should not be set to > + * MIGRATION_STATUS_FAILED until migrate fd cleanup is done, else > + * the client can trigger another instance of migration before > + * the cleanup is complete. Hence, set the status to 'failing' > + * when a migration failure happens and once the cleanup is done, > + * set it to MIGRATION_STATUS_FAILED. > + */ > + int failing; > }; > > void migrate_set_state(MigrationStatus *state, MigrationStatus old_state, > diff --git a/migration/multifd.c b/migration/multifd.c > index 4f973d70e0..48ece2162c 100644 > --- a/migration/multifd.c > +++ b/migration/multifd.c > @@ -878,8 +878,7 @@ bool multifd_send_setup(void) > return true; > > err: > - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > return false; > } > > @@ -949,8 +948,7 @@ static void multifd_recv_terminate_threads(Error *err) > migrate_set_error(s, err); > if (s->state == MIGRATION_STATUS_SETUP || > s->state == MIGRATION_STATUS_ACTIVE) { > - migrate_set_state(&s->state, s->state, > - MIGRATION_STATUS_FAILED); > + qatomic_set(&s->failing, 1); > } > } > > diff --git a/migration/savevm.c b/migration/savevm.c > index 927b1146c0..4f0ef34f23 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1686,7 +1686,6 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp) > { > int ret; > MigrationState *ms = migrate_get_current(); > - MigrationStatus status; > > if (migration_is_running()) { > error_setg(errp, "There's a migration process in progress"); > @@ -1723,11 +1722,11 @@ cleanup: > qemu_savevm_state_cleanup(); > > if (ret != 0) { > - status = MIGRATION_STATUS_FAILED; > + qatomic_set(&ms->failing, 1); > } else { > - status = MIGRATION_STATUS_COMPLETED; > + migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, > + MIGRATION_STATUS_COMPLETED); > } > - migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status); > > /* f is outer parameter, it should not stay in global migration state > after > * this function finished */