Currently, it is possible that a live migration never finishes when the dirty page rate is high compared to the scan/transfer rate. The exact values for MAX_MEMORY_ITERATIONS and MAX_TOTAL_MEMORY_TRANSFER_FACTOR are arguable, but there should be *some* limit to force the final iteration of a live migration that does not converge.
---
 arch_init.c |   10 +++++++++-
 1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 4486925..57fcb1e 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -89,6 +89,9 @@ const uint32_t arch_type = QEMU_ARCH;
 #define RAM_SAVE_FLAG_EOS      0x10
 #define RAM_SAVE_FLAG_CONTINUE 0x20
 
+#define MAX_MEMORY_ITERATIONS 10
+#define MAX_TOTAL_MEMORY_TRANSFER_FACTOR 3
+
 static int is_dup_page(uint8_t *page, uint8_t ch)
 {
     uint32_t val = ch << 24 | ch << 16 | ch << 8 | ch;
@@ -107,6 +110,8 @@ static int is_dup_page(uint8_t *page, uint8_t ch)
 static RAMBlock *last_block;
 static ram_addr_t last_offset;
 
+static int numberFullMemoryIterations = 0;
+
 static int ram_save_block(QEMUFile *f)
 {
     RAMBlock *block = last_block;
@@ -158,7 +163,10 @@ static int ram_save_block(QEMUFile *f)
             offset = 0;
             block = QLIST_NEXT(block, next);
             if (!block)
+            {
+                numberFullMemoryIterations++;
                 block = QLIST_FIRST(&ram_list.blocks);
+            }
         }
 
     current_addr = block->offset + offset;
@@ -295,7 +303,7 @@ int ram_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
 
     expected_time = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
 
-    return (stage == 2) && (expected_time <= migrate_max_downtime());
+    return (stage == 2) && ((expected_time <= migrate_max_downtime() || (numberFullMemoryIterations == MAX_MEMORY_ITERATIONS) || (bytes_transferred > (MAX_TOTAL_MEMORY_TRANSFER_FACTOR*ram_bytes_total()))));
 }
 
 static inline void *host_from_stream_offset(QEMUFile *f,
-- 
1.7.0.4