Move to stage3 only when the remaining work can be done below max downtime. Changes from v1: remove the max-iterations limit; instead, infer storage read performance and use it to estimate the time needed to transfer the remaining dirty blocks.
Signed-off-by: Liran Schour <lir...@il.ibm.com> --- block-migration.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 132 insertions(+), 4 deletions(-) diff --git a/block-migration.c b/block-migration.c index 16df75f..5ef3eb8 100644 --- a/block-migration.c +++ b/block-migration.c @@ -17,6 +17,7 @@ #include "qemu-queue.h" #include "monitor.h" #include "block-migration.h" +#include "migration.h" #include <assert.h> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) @@ -60,6 +61,7 @@ typedef struct BlkMigBlock { QEMUIOVector qiov; BlockDriverAIOCB *aiocb; int ret; + long double time; QSIMPLEQ_ENTRY(BlkMigBlock) entry; } BlkMigBlock; @@ -74,11 +76,79 @@ typedef struct BlkMigState { int64_t total_sector_sum; int prev_progress; int bulk_completed; - int dirty_iterations; + long double total_time; + int reads; } BlkMigState; static BlkMigState block_mig_state; +static int64_t get_clock_realtime(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000); +} + +#ifdef WIN32 + +static int64_t clock_freq; + +static void init_get_clock(void) +{ + LARGE_INTEGER freq; + int ret; + ret = QueryPerformanceFrequency(&freq); + if (ret == 0) { + fprintf(stderr, "Could not calibrate ticks\n"); + exit(1); + } + clock_freq = freq.QuadPart; +} + +static int64_t get_clock(void) +{ + LARGE_INTEGER ti; + QueryPerformanceCounter(&ti); + return muldiv64(ti.QuadPart, get_ticks_per_sec(), clock_freq); +} + +#else + +static int use_rt_clock; + +static void init_get_clock(void) +{ + use_rt_clock = 0; +#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \ + || defined(__DragonFly__) || defined(__FreeBSD_kernel__) + { + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) { + use_rt_clock = 1; + } + } +#endif +} + +static int64_t get_clock(void) +{ +#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \ + || 
defined(__DragonFly__) || defined(__FreeBSD_kernel__) + if (use_rt_clock) { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000LL + ts.tv_nsec; + } else +#endif + { + /* XXX: using gettimeofday leads to problems if the date + changes, so it should be avoided. */ + return get_clock_realtime(); + } +} +#endif + static void blk_send(QEMUFile *f, BlkMigBlock * blk) { int len; @@ -127,12 +197,28 @@ uint64_t blk_mig_bytes_total(void) return sum << BDRV_SECTOR_BITS; } +static inline void add_avg_read_time(long double time) +{ + block_mig_state.reads++; + block_mig_state.total_time += time; +} + +static inline long double compute_read_bwidth(void) +{ + assert(block_mig_state.total_time != 0); + return (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time; +} + static void blk_mig_read_cb(void *opaque, int ret) { BlkMigBlock *blk = opaque; blk->ret = ret; + blk->time = get_clock() - blk->time; + + add_avg_read_time(blk->time); + QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); block_mig_state.submitted--; @@ -182,6 +268,8 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + blk->time = get_clock(); + blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); if (!blk->aiocb) { @@ -223,6 +311,8 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) block_mig_state.total_sector_sum = 0; block_mig_state.prev_progress = -1; block_mig_state.bulk_completed = 0; + block_mig_state.total_time = 0; + block_mig_state.reads = 0; for (bs = bdrv_first; bs != NULL; bs = bs->next) { if (bs->type == BDRV_TYPE_HD) { @@ -321,6 +411,8 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, blk->iov.iov_base = blk->buf; blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + + blk->time = get_clock(); blk->aiocb = 
bdrv_aio_readv(bmds->bs, sector, &blk->qiov, nr_sectors, blk_mig_read_cb, blk); @@ -403,10 +495,42 @@ static void flush_blks(QEMUFile* f) block_mig_state.transferred); } +static int64_t get_remaining_dirty(void) +{ + BlkMigDevState *bmds; + int64_t dirty = 0; + + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + dirty += bdrv_get_dirty_count(bmds->bs); + } + + return dirty * BLOCK_SIZE; +} + static int is_stage2_completed(void) { - return (block_mig_state.submitted == 0 && - block_mig_state.bulk_completed); + int64_t remaining_dirty; + long double bwidth; + + if (block_mig_state.bulk_completed == 1) { + + remaining_dirty = get_remaining_dirty(); + if(remaining_dirty == 0) { + return 1; + } + + bwidth = compute_read_bwidth(); + + if ((remaining_dirty / bwidth) <= + migrate_max_downtime()) { + /* finish stage2 because we think that we can finish remaining work + below max_downtime */ + + return 1; + } + } + + return 0; } static void blk_mig_cleanup(Monitor *mon) @@ -490,7 +614,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) } if (stage == 3) { - /* we now for sure that save bulk is completed */ + /* we know for sure that save bulk is completed and + all async read completed */ + assert(block_mig_state.submitted == 0); while(blk_mig_save_dirty_block(mon, f, 0) != 0); blk_mig_cleanup(mon); @@ -580,4 +706,6 @@ void blk_mig_init(void) register_savevm_live("block", 0, 1, block_set_params, block_save_live, NULL, block_load, &block_mig_state); + + init_get_clock(); } -- 1.6.0.4