Busy enterprise workloads hosted on large VMs tend to dirty memory
faster than it can be transferred by live guest migration. Despite some
good recent improvements (and the use of dedicated 10Gig NICs between
hosts), live migration does not converge.

A few options that have been discussed or pursued to help with the
convergence issue include:

1) Slow the guest down considerably via cgroups CPU controls. This
   requires libvirt client support to detect the situation and trigger
   the action, but it is conceptually similar to this RFC change.

2) Speed up the transfer rate:
   - RDMA-based pre-copy: lower overhead and fast. Unfortunately it has
     a few restrictions, and some customers still choose not to deploy
     RDMA.
   - Add parallelism to improve the transfer rate and use multiple
     (bonded) 10Gig connections; this could add some overhead on the
     host.

3) Post-copy (preferably with RDMA), or a pre+post-copy hybrid. This
   sounds promising, but newer failure scenarios need to be considered
   and handled.

The following [RFC] change attempts to auto-detect the
lack-of-convergence situation and trigger a slowdown of the workload by
explicitly disallowing the VCPUs from spending much time in the VM
context. No external trigger is required (unlike option 1), and it can
coexist with the enhancements being pursued as part of option 2 (e.g.
RDMA). The migration thread tries to catch up, and this eventually
leads to convergence in some "deterministic" amount of time. Yes, it
does impact the performance of all the VCPUs, but in my observation
that lasts only for a short duration; i.e. we end up entering stage 3
(the downtime phase) soon after that.

Convergence was verified using the following:
 - SPECjbb2005 workload running on a 20-VCPU/128G guest (~80% busy)
 - an OLTP-like workload running on an 80-VCPU/512G guest (~80% busy)

Thanks to Juan and Paolo for some useful suggestions. More refinement
is needed (e.g. a smarter way to detect the situation, variable
throttling based on need, etc.). For now I was hoping to get some
feedback or hear about other, more refined ideas.
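To make the detection concrete, here is a minimal standalone sketch of
the heuristic that the patch applies at each dirty-bitmap sync. The
function name should_throttle() and the standalone framing are mine,
for illustration only; the patch inlines this logic in
migration_bitmap_sync():

    /* Sketch of the detection heuristic (illustrative only). Called
     * once per sync period: if the guest dirtied more than half of
     * what the migration transferred in the same period, bump a
     * counter; past N hits, ask for throttling. As in the patch, the
     * counter is never reset. */
    #include <stdbool.h>
    #include <stdint.h>

    #define DIRTY_RATE_HIGH_MAX 5      /* "N" in the patch */

    static bool should_throttle(uint64_t dirtied_bytes,
                                uint64_t xferred_bytes)
    {
        static int dirty_rate_high_cnt;

        if (dirtied_bytes > xferred_bytes / 2) {
            if (dirty_rate_high_cnt++ > DIRTY_RATE_HIGH_MAX) {
                return true;           /* not converging: throttle */
            }
        }
        return false;
    }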
Signed-off-by: Chegu Vinod <chegu_vi...@hp.com>
---
 arch_init.c                   |   37 +++++++++++++++++++++++++++++++
 cpus.c                        |   12 ++++++++++
 include/migration/migration.h |    9 +++++++
 include/qemu/main-loop.h      |    3 ++
 kvm-all.c                     |   49 +++++++++++++++++++++++++++++++++++++++++
 migration.c                   |    6 +++++
 6 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 92de1bd..a06ff81 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,7 @@ int graphic_depth = 15;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static uint64_t mig_throttle_on;
 
 /***********************************************************/
 /* ram save/restore */
@@ -379,12 +380,19 @@ static void migration_bitmap_sync(void)
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
     static int64_t num_dirty_pages_period;
+    static int64_t bytes_xfer_prev;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+    static int dirty_rate_high_cnt;
 
     if (!start_time) {
         start_time = qemu_get_clock_ms(rt_clock);
     }
 
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
+
     trace_migration_bitmap_sync_start();
     memory_global_sync_dirty_bitmap(get_system_memory());
 
@@ -404,6 +412,23 @@ static void migration_bitmap_sync(void)
 
     /* more than 1 second = 1000 millisecons */
     if (end_time > start_time + 1000) {
+        /* The following detection logic can be refined later. For now:
+           check whether the bytes dirtied in this period amount to more
+           than 50% of the bytes that actually got transferred since the
+           last time we were in this routine.
+           If that happens N times (for now N==5), we turn on the
+           throttle-down logic. */
+        bytes_xfer_now = ram_bytes_transferred();
+        if (s->dirty_pages_rate &&
+            ((num_dirty_pages_period * TARGET_PAGE_SIZE) >
+             ((bytes_xfer_now - bytes_xfer_prev) / 2))) {
+            if (dirty_rate_high_cnt++ > 5) {
+                DPRINTF("Unable to converge. Throttling down guest\n");
+                mig_throttle_on = 1;
+            }
+        }
+        bytes_xfer_prev = bytes_xfer_now;
+
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
@@ -496,6 +521,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
     return bytes_sent;
 }
 
+bool throttling_needed(void)
+{
+    bool value;
+
+    qemu_mutex_lock_mig_throttle();
+    value = mig_throttle_on;
+    qemu_mutex_unlock_mig_throttle();
+
+    return value;
+}
+
 static uint64_t bytes_transferred;
 
 static ram_addr_t ram_save_remaining(void)
diff --git a/cpus.c b/cpus.c
index 5a98a37..eea6601 100644
--- a/cpus.c
+++ b/cpus.c
@@ -616,6 +616,7 @@ static void qemu_tcg_init_cpu_signals(void)
 #endif /* _WIN32 */
 
 static QemuMutex qemu_global_mutex;
+static QemuMutex qemu_mig_throttle_mutex;
 static QemuCond qemu_io_proceeded_cond;
 static bool iothread_requesting_mutex;
 
@@ -638,6 +639,7 @@ void qemu_init_cpu_loop(void)
     qemu_cond_init(&qemu_work_cond);
     qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
+    qemu_mutex_init(&qemu_mig_throttle_mutex);
 
     qemu_thread_get_self(&io_thread);
 }
@@ -923,6 +925,16 @@ static bool qemu_in_vcpu_thread(void)
     return cpu_single_env && qemu_cpu_is_self(ENV_GET_CPU(cpu_single_env));
 }
 
+void qemu_mutex_lock_mig_throttle(void)
+{
+    qemu_mutex_lock(&qemu_mig_throttle_mutex);
+}
+
+void qemu_mutex_unlock_mig_throttle(void)
+{
+    qemu_mutex_unlock(&qemu_mig_throttle_mutex);
+}
+
 void qemu_mutex_lock_iothread(void)
 {
     if (!tcg_enabled()) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index e2acec6..cccee91 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -92,6 +92,15 @@ uint64_t ram_bytes_remaining(void);
 uint64_t ram_bytes_transferred(void);
 uint64_t ram_bytes_total(void);
 
+#ifndef _QEMU_MIG_THROTTLE
+#define _QEMU_MIG_THROTTLE
+
+bool throttling_needed(void);
+bool throttling_now(void);
+void *migration_throttle_down(void *);
+
+#endif
+
 extern SaveVMHandlers savevm_ram_handlers;
 
 uint64_t dup_mig_bytes_transferred(void);
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 6f0200a..9a3886d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -299,6 +299,9 @@ void qemu_mutex_lock_iothread(void);
  */
 void qemu_mutex_unlock_iothread(void);
 
+void qemu_mutex_lock_mig_throttle(void);
+void qemu_mutex_unlock_mig_throttle(void);
+
 /* internal interfaces */
 
 void qemu_fd_register(int fd);
diff --git a/kvm-all.c b/kvm-all.c
index 2d92721..95010ce 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -33,6 +33,8 @@
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 #include "qemu/event_notifier.h"
+#include "sysemu/cpus.h"
+#include "migration/migration.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -116,6 +118,8 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_LAST_INFO
 };
 
+static void mig_delay_vcpu(void);
+
 static KVMSlot *kvm_alloc_slot(KVMState *s)
 {
     int i;
@@ -1609,6 +1613,10 @@ int kvm_cpu_exec(CPUArchState *env)
         }
         qemu_mutex_unlock_iothread();
 
+        if (throttling_needed()) {
+            mig_delay_vcpu();
+        }
+
         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
 
         qemu_mutex_lock_iothread();
@@ -2032,3 +2040,44 @@ int kvm_on_sigbus(int code, void *addr)
 {
     return kvm_arch_on_sigbus(code, addr);
 }
+
+static bool throttling;
+bool throttling_now(void)
+{
+    return throttling;
+}
+
+static void mig_delay_vcpu(void)
+{
+    g_usleep(50 * 1000);
+}
+
+/* Stub used to get the vcpu out of VM context and into qemu via
+   run_on_cpu(). */
+static void mig_kick_cpu(void *opq)
+{
+    return;
+}
+
+/* To reduce the dirty rate, explicitly disallow the VCPUs from spending
+   much time in VM context. The migration thread will try to catch up.
+   The workload experiences a greater performance drop, but for a
+   shorter duration. */
+void *migration_throttle_down(void *opaque)
+{
+    throttling = true;
+    while (throttling_needed()) {
+        CPUArchState *penv = first_cpu;
+        while (penv) {
+            qemu_mutex_lock_iothread();
+            run_on_cpu(ENV_GET_CPU(penv), mig_kick_cpu, NULL);
+            qemu_mutex_unlock_iothread();
+            penv = penv->next_cpu;
+        }
+        g_usleep(25 * 1000);
+    }
+    throttling = false;
+    return NULL;
+}
diff --git a/migration.c b/migration.c
index 3eb0fad..a464afc 100644
--- a/migration.c
+++ b/migration.c
@@ -24,6 +24,7 @@
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "sysemu/cpus.h"
 
 //#define DEBUG_MIGRATION
 
@@ -503,6 +504,7 @@ static void *migration_thread(void *opaque)
     int64_t max_size = 0;
     int64_t start_time = initial_time;
     bool old_vm_running = false;
+    QemuThread thread;
 
     DPRINTF("beginning savevm\n");
     qemu_savevm_state_begin(s->file, &s->params);
@@ -517,6 +519,10 @@ static void *migration_thread(void *opaque)
         DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
         if (pending_size && pending_size >= max_size) {
             qemu_savevm_state_iterate(s->file);
+            if (throttling_needed() && !throttling_now()) {
+                qemu_thread_create(&thread, migration_throttle_down,
+                                   NULL, QEMU_THREAD_DETACHED);
+            }
         } else {
             DPRINTF("done iterating\n");
             qemu_mutex_lock_iothread();
-- 
1.7.1
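P.S. A back-of-the-envelope model of the throttle duty cycle (my own
illustration, not code from the patch): while the throttle is on, each
VCPU sleeps 50ms in mig_delay_vcpu() before every KVM_RUN, and
migration_throttle_down() kicks all VCPUs out of guest mode every 25ms,
so a VCPU runs for at most ~25ms between 50ms sleeps:

    #include <stdio.h>

    int main(void)
    {
        const double kick_period_ms = 25.0; /* g_usleep(25 * 1000) */
        const double vcpu_delay_ms = 50.0;  /* g_usleep(50 * 1000) */

        /* Upper bound: one full kick period of guest time, then a
         * forced sleep before the next KVM_RUN. */
        double guest_fraction = kick_period_ms /
                                (kick_period_ms + vcpu_delay_ms);
        printf("max guest-time duty cycle: ~%.0f%%\n",
               guest_fraction * 100);
        return 0;                           /* prints ~33% */
    }

In other words, a throttled VCPU spends at most roughly a third of
wall-clock time in guest context (less in practice, given exit/entry
overhead), which is what pulls the dirty rate below the transfer rate.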