* zhanghailiang (zhang.zhanghaili...@huawei.com) wrote: > If the net connection between primary host and secondary host breaks > while COLO/COLO incoming threads are doing read() or write(). > It will block until connection is timeout, and the failover process > will be blocked because of it. > > So it is necessary to shutdown all the socket fds used by COLO > to avoid this situation. Besides, we should close the corresponding > file descriptors after failvoer BH shutdown them, > Or there will be an error.
Hi, There are two parts to this patch: a) Add some semaphores to sequence failover b) Use shutdown() At first I wondered if perhaps they should be split; but I see the reason for the semaphores is mostly to stop the race between the fd's getting closed and the shutdown() calls; so I think it's OK. Do you have any problems with these semaphores during powering off the guest? Dave > Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com> > Signed-off-by: Li Zhijian <lizhij...@cn.fujitsu.com> > Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com> > Cc: Dr. David Alan Gilbert <dgilb...@redhat.com> > --- > include/migration/migration.h | 3 +++ > migration/colo.c | 43 > +++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 46 insertions(+) > > diff --git a/include/migration/migration.h b/include/migration/migration.h > index 487ac1e..7cac877 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -113,6 +113,7 @@ struct MigrationIncomingState { > QemuThread colo_incoming_thread; > /* The coroutine we should enter (back) after failover */ > Coroutine *migration_incoming_co; > + QemuSemaphore colo_incoming_sem; > > /* See savevm.c */ > LoadStateEntry_Head loadvm_handlers; > @@ -182,6 +183,8 @@ struct MigrationState > QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) > src_page_requests; > /* The RAMBlock used in the last src_page_request */ > RAMBlock *last_req_rb; > + /* The semaphore is used to notify COLO thread that failover is finished > */ > + QemuSemaphore colo_exit_sem; > > /* The semaphore is used to notify COLO thread to do checkpoint */ > QemuSemaphore colo_checkpoint_sem; > diff --git a/migration/colo.c b/migration/colo.c > index 08b2e46..3222812 100644 > --- a/migration/colo.c > +++ b/migration/colo.c > @@ -59,6 +59,18 @@ static void secondary_vm_do_failover(void) > /* recover runstate to normal migration finish state */ > autostart = true; > } > + /* > + * Make sure COLO incoming thread not block in recv or send, > + * If mis->from_src_file and mis->to_src_file use the same fd, > + * The second shutdown() will return -1, we ignore this value, > + * It is harmless. > + */ > + if (mis->from_src_file) { > + qemu_file_shutdown(mis->from_src_file); > + } > + if (mis->to_src_file) { > + qemu_file_shutdown(mis->to_src_file); > + } > > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, > FAILOVER_STATUS_COMPLETED); > @@ -67,6 +79,8 @@ static void secondary_vm_do_failover(void) > "secondary VM", FailoverStatus_lookup[old_state]); > return; > } > + /* Notify COLO incoming thread that failover work is finished */ > + qemu_sem_post(&mis->colo_incoming_sem); > /* For Secondary VM, jump to incoming co */ > if (mis->migration_incoming_co) { > qemu_coroutine_enter(mis->migration_incoming_co); > @@ -81,6 +95,18 @@ static void primary_vm_do_failover(void) > migrate_set_state(&s->state, MIGRATION_STATUS_COLO, > MIGRATION_STATUS_COMPLETED); > > + /* > + * Wake up COLO thread which may blocked in recv() or send(), > + * The s->rp_state.from_dst_file and s->to_dst_file may use the > + * same fd, but we still shutdown the fd for twice, it is harmless. > + */ > + if (s->to_dst_file) { > + qemu_file_shutdown(s->to_dst_file); > + } > + if (s->rp_state.from_dst_file) { > + qemu_file_shutdown(s->rp_state.from_dst_file); > + } > + > old_state = failover_set_state(FAILOVER_STATUS_ACTIVE, > FAILOVER_STATUS_COMPLETED); > if (old_state != FAILOVER_STATUS_ACTIVE) { > @@ -88,6 +114,8 @@ static void primary_vm_do_failover(void) > FailoverStatus_lookup[old_state]); > return; > } > + /* Notify COLO thread that failover work is finished */ > + qemu_sem_post(&s->colo_exit_sem); > } > > void colo_do_failover(MigrationState *s) > @@ -361,6 +389,14 @@ out: > > timer_del(s->colo_delay_timer); > > + /* Hope this not to be too long to wait here */ > + qemu_sem_wait(&s->colo_exit_sem); > + qemu_sem_destroy(&s->colo_exit_sem); > + /* > + * Must be called after failover BH is completed, > + * Or the failover BH may shutdown the wrong fd that > + * re-used by other threads after we release here. > + */ > if (s->rp_state.from_dst_file) { > qemu_fclose(s->rp_state.from_dst_file); > } > @@ -385,6 +421,7 @@ void migrate_start_colo_process(MigrationState *s) > s->colo_delay_timer = timer_new_ms(QEMU_CLOCK_HOST, > colo_checkpoint_notify, s); > > + qemu_sem_init(&s->colo_exit_sem, 0); > migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, > MIGRATION_STATUS_COLO); > colo_process_checkpoint(s); > @@ -423,6 +460,8 @@ void *colo_process_incoming_thread(void *opaque) > uint64_t value; > Error *local_err = NULL; > > + qemu_sem_init(&mis->colo_incoming_sem, 0); > + > migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, > MIGRATION_STATUS_COLO); > > @@ -533,6 +572,10 @@ out: > qemu_fclose(fb); > } > > + /* Hope this not to be too long to loop here */ > + qemu_sem_wait(&mis->colo_incoming_sem); > + qemu_sem_destroy(&mis->colo_incoming_sem); > + /* Must be called after failover BH is completed */ > if (mis->to_src_file) { > qemu_fclose(mis->to_src_file); > } > -- > 1.8.3.1 > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK