Make sure master start block replication after slave's block replication started.
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com> Signed-off-by: Wen Congyang <we...@cn.fujitsu.com> Signed-off-by: Yang Hongyang <yan...@cn.fujitsu.com> Signed-off-by: Li Zhijian <lizhij...@cn.fujitsu.com> --- migration/colo.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- trace-events | 2 ++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/migration/colo.c b/migration/colo.c index 44b0f93..37d958c 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -20,6 +20,7 @@ #include "qapi-event.h" #include "net/colo-nic.h" #include "qmp-commands.h" +#include "block/block_int.h" /* * We should not do checkpoint one after another without any time interval, @@ -134,6 +135,7 @@ static void secondary_vm_do_failover(void) { int old_state; MigrationIncomingState *mis = migration_incoming_get_current(); + Error *local_err = NULL; /* Can not do failover during the process of VM's loading VMstate, Or * it will break the secondary VM. @@ -156,6 +158,12 @@ static void secondary_vm_do_failover(void) } colo_proxy_destroy(COLO_MODE_SECONDARY); + bdrv_stop_replication_all(true, &local_err); + if (local_err) { + error_report_err(local_err); + } + trace_colo_stop_block_replication("failover"); + if (!autostart) { error_report("\"-S\" qemu option will be ignored in secondary side"); /* recover runstate to normal migration finish state */ @@ -184,6 +192,7 @@ static void primary_vm_do_failover(void) { MigrationState *s = migrate_get_current(); int old_state; + Error *local_err = NULL; colo_proxy_destroy(COLO_MODE_PRIMARY); @@ -201,6 +210,12 @@ static void primary_vm_do_failover(void) } qemu_bh_schedule(s->cleanup_bh); + bdrv_stop_replication_all(true, &local_err); + if (local_err) { + error_report_err(local_err); + } + trace_colo_stop_block_replication("failover"); + vm_start(); old_state = failover_set_state(FAILOVER_STATUS_HANDLING, @@ -282,6 +297,7 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) int colo_shutdown, ret; size_t size; QEMUFile *trans = NULL; + Error *local_err = NULL; ret = colo_ctl_put(s->file, COLO_CHECKPOINT_NEW); if (ret < 0) { @@ -335,6 +351,16 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) goto out; } + /* we call this api although this may do nothing on primary side */ + qemu_mutex_lock_iothread(); + bdrv_do_checkpoint_all(&local_err); + qemu_mutex_unlock_iothread(); + if (local_err) { + error_report_err(local_err); + ret = -1; + goto out; + } + ret = colo_ctl_put(s->file, COLO_CHECKPOINT_SEND); if (ret < 0) { goto out; @@ -364,6 +390,10 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) } if (colo_shutdown) { + qemu_mutex_lock_iothread(); + bdrv_stop_replication_all(false, NULL); + trace_colo_stop_block_replication("shutdown"); + qemu_mutex_unlock_iothread(); colo_ctl_put(s->file, COLO_GUEST_SHUTDOWN); qemu_fflush(s->file); colo_shutdown_requested = 0; @@ -393,6 +423,7 @@ static void *colo_thread(void *opaque) QEMUFile *colo_control = NULL; int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); int i, ret; + Error *local_err = NULL; failover_init_state(); if (colo_proxy_init(COLO_MODE_PRIMARY) != 0) { @@ -422,6 +453,12 @@ static void *colo_thread(void *opaque) } qemu_mutex_lock_iothread(); + /* start block replication */ + bdrv_start_replication_all(REPLICATION_MODE_PRIMARY, &local_err); + if (local_err) { + goto out; + } + trace_colo_start_block_replication(); vm_start(); qemu_mutex_unlock_iothread(); trace_colo_vm_state_change("stop", "run"); @@ -472,7 +509,12 @@ do_checkpoint: } out: - error_report("colo: some error happens in colo_thread"); + if (local_err) { + error_report_err(local_err); + } else { + error_report("colo: some error happens in colo_thread"); + } + if (colo_control) { qemu_fclose(colo_control); } @@ -544,6 +586,8 @@ static int colo_wait_handle_cmd(QEMUFile *f, int *checkpoint_request) case COLO_GUEST_SHUTDOWN: qemu_mutex_lock_iothread(); vm_stop_force_state(RUN_STATE_COLO); + bdrv_stop_replication_all(false, NULL); + trace_colo_stop_block_replication("shutdown"); qemu_system_shutdown_request_core(); qemu_mutex_unlock_iothread(); /* the main thread will exit and termiante the whole @@ -563,6 +607,7 @@ void *colo_process_incoming_checkpoints(void *opaque) QEMUFile *ctl = NULL, *fb = NULL; uint64_t total_size; int i, ret; + Error *local_err = NULL; migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COLO); @@ -590,6 +635,15 @@ void *colo_process_incoming_checkpoints(void *opaque) goto out; } + qemu_mutex_lock_iothread(); + /* start block replication */ + bdrv_start_replication_all(REPLICATION_MODE_SECONDARY, &local_err); + if (local_err) { + goto out; + } + qemu_mutex_unlock_iothread(); + trace_colo_start_block_replication(); + ret = colo_ctl_put(ctl, COLO_CHECPOINT_READY); if (ret < 0) { goto out; @@ -674,8 +728,15 @@ void *colo_process_incoming_checkpoints(void *opaque) goto out; } - vmstate_loading = false; + /* discard colo disk buffer */ + bdrv_do_checkpoint_all(&local_err); qemu_mutex_unlock_iothread(); + if (local_err) { + vmstate_loading = false; + goto out; + } + + vmstate_loading = false; if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { failover_set_state(FAILOVER_STATUS_RELAUNCH, FAILOVER_STATUS_NONE); @@ -698,7 +759,11 @@ void *colo_process_incoming_checkpoints(void *opaque) } out: - error_report("Detect some error or get a failover request"); + if (local_err) { + error_report_err(local_err); + } else { + error_report("Detect some error or get a failover request"); + } /* * Here, we raise a qmp event to the user, * It can help user to know what happens, and help deciding whether to diff --git a/trace-events b/trace-events index 3e31809..03cd035 100644 --- a/trace-events +++ b/trace-events @@ -1478,6 +1478,8 @@ colo_ctl_put(const char *msg) "Send '%s'" colo_ctl_get(const char *msg) "Receive '%s'" colo_failover_set_state(int new_state) "new state %d" colo_rcv_pkt(int result) "Result of net packets comparing is different: %d" +colo_start_block_replication(void) "Block replication is started" +colo_stop_block_replication(const char *reason) "Block replication is stopped(reason: '%s')" # kvm-all.c kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" -- 1.8.3.1