On Thu, Feb 08, 2024 at 10:54:02AM -0800, Steve Sistare wrote:
> Check the status returned by migration notifiers and report errors.
> If notifiers fail, call the notifiers again so they can clean up.
> None of the notifiers return an error status at this time.
> 
> Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
> ---
>  include/migration/misc.h |  3 ++-
>  migration/migration.c    | 40 +++++++++++++++++++++++++++++-----------
>  2 files changed, 31 insertions(+), 12 deletions(-)
> 
> diff --git a/include/migration/misc.h b/include/migration/misc.h
> index 0ea1902..6dc234b 100644
> --- a/include/migration/misc.h
> +++ b/include/migration/misc.h
> @@ -82,7 +82,8 @@ void migration_add_notifier(NotifierWithReturn *notify,
>  void migration_add_notifier_mode(NotifierWithReturn *notify,
>                                   MigrationNotifyFunc func, MigMode mode);
>  void migration_remove_notifier(NotifierWithReturn *notify);
> -void migration_call_notifiers(MigrationState *s, MigrationEventType type);
> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
> +                             Error **errp);
>  bool migration_in_setup(MigrationState *);
>  bool migration_has_finished(MigrationState *);
>  bool migration_has_failed(MigrationState *);
> diff --git a/migration/migration.c b/migration/migration.c
> index 01d8867..d1fce9e 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -1318,6 +1318,8 @@ void migrate_set_state(int *state, int old_state, int 
> new_state)
>  
>  static void migrate_fd_cleanup(MigrationState *s)
>  {
> +    Error *local_err = NULL;
> +
>      g_free(s->hostname);
>      s->hostname = NULL;
>      json_writer_free(s->vmdesc);
> @@ -1362,13 +1364,23 @@ static void migrate_fd_cleanup(MigrationState *s)
>                            MIGRATION_STATUS_CANCELLED);
>      }
>  
> +    if (!migration_has_failed(s) &&
> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_DONE, &local_err)) {
> +
> +        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
> +        migrate_set_error(s, local_err);
> +        error_free(local_err);
> +    }
> +
>      if (s->error) {
>          /* It is used on info migrate.  We can't free it */
>          error_report_err(error_copy(s->error));
>      }
> -    migration_call_notifiers(s, s->state == MIGRATION_STATUS_COMPLETED ?
> -                             MIG_EVENT_PRECOPY_DONE :
> -                             MIG_EVENT_PRECOPY_FAILED);
> +
> +    if (migration_has_failed(s)) {
> +        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
> +    }

AFAIU, the whole point of this split is to allow DONE notifiers to fail too,
so that if that happens we can call the notifiers again with the FAILED event.

Perhaps we can avoid that complexity, and instead document that only SETUP
notifiers can fail?

The problem is that failing a notifier at this stage (if migration has already
finished) may be too late: the dest QEMU may already have started running, so
there is no way to roll back.  Could we document that, and check and assert
for the !SETUP cases to make sure an error is never hit?

> +
>      block_cleanup_parameters();
>      yank_unregister_instance(MIGRATION_YANK_INSTANCE);
>  }
> @@ -1481,13 +1493,15 @@ void migration_remove_notifier(NotifierWithReturn 
> *notify)
>      }
>  }
>  
> -void migration_call_notifiers(MigrationState *s, MigrationEventType type)
> +int migration_call_notifiers(MigrationState *s, MigrationEventType type,
> +                             Error **errp)
>  {
>      MigMode mode = s->parameters.mode;
>      MigrationEvent e;
>  
>      e.type = type;
> -    notifier_with_return_list_notify(&migration_state_notifiers[mode], &e, 
> 0);
> +    return notifier_with_return_list_notify(&migration_state_notifiers[mode],
> +                                            &e, errp);
>  }
>  
>  bool migration_in_setup(MigrationState *s)
> @@ -2535,7 +2549,9 @@ static int postcopy_start(MigrationState *ms, Error 
> **errp)
>       * at the transition to postcopy and after the device state; in 
> particular
>       * spice needs to trigger a transition now
>       */
> -    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE);
> +    if (migration_call_notifiers(ms, MIG_EVENT_PRECOPY_DONE, errp)) {
> +        goto fail;
> +    }
>  
>      migration_downtime_end(ms);
>  
> @@ -2555,11 +2571,10 @@ static int postcopy_start(MigrationState *ms, Error 
> **errp)
>  
>      ret = qemu_file_get_error(ms->to_dst_file);
>      if (ret) {
> -        error_setg(errp, "postcopy_start: Migration stream errored");
> -        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                              MIGRATION_STATUS_FAILED);
> +        error_setg_errno(errp, -ret, "postcopy_start: Migration stream 
> error");
> +        bql_lock();
> +        goto fail;
>      }
> -
>      trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
>  
>      return ret;
> @@ -2580,6 +2595,7 @@ fail:
>              error_report_err(local_err);
>          }
>      }
> +    migration_call_notifiers(ms, MIG_EVENT_PRECOPY_FAILED, NULL);
>      bql_unlock();
>      return -1;
>  }
> @@ -3594,7 +3610,9 @@ void migrate_fd_connect(MigrationState *s, Error 
> *error_in)
>          rate_limit = migrate_max_bandwidth();
>  
>          /* Notify before starting migration thread */
> -        migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP);
> +        if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_SETUP, 
> &local_err)) {
> +            goto fail;
> +        }
>      }
>  
>      migration_rate_set(rate_limit);
> -- 
> 1.8.3.1
> 

-- 
Peter Xu


Reply via email to