Busy enterprise workloads hosted on large VMs tend to dirty memory
faster than it can be transferred by live guest migration. Despite some
good recent improvements (and the use of dedicated 10Gig NICs between
hosts), live migration does not converge.

A few options that have been discussed or pursued to help with the
convergence issue include:

1) Slow the guest down considerably via cgroups CPU controls. This
   requires libvirt client support to detect the situation and trigger
   the action, but it is conceptually similar to this RFC change.

2) Speed up the transfer rate:
   - RDMA-based pre-copy: lower overhead and fast. Unfortunately it has
     a few restrictions, and some customers still choose not to deploy
     RDMA.
   - Add parallelism to improve the transfer rate and use multiple
     (bonded) 10Gig connections; this could add some overhead on the
     host.

3) Post-copy (preferably with RDMA), or a pre+post-copy hybrid. This
   sounds promising, but newer failure scenarios need to be considered
   and handled.

The following [RFC] change attempts to auto-detect the
lack-of-convergence situation and trigger a slowdown of the workload by
explicitly disallowing the VCPUs from spending much time in the VM
context. No external trigger is required (unlike option 1), and it can
coexist with the enhancements being pursued as part of option 2 (e.g.
RDMA). The migration thread tries to catch up, and this eventually
leads to convergence in some "deterministic" amount of time. Yes, it
does impact the performance of all the VCPUs, but in my observation
that lasts only for a short duration; i.e. we end up entering stage 3
(the downtime phase) soon after that.

Convergence was verified using the following:
 - SPECjbb2005 workload running on a 20-VCPU/128G guest (~80% busy)
 - an OLTP-like workload running on an 80-VCPU/512G guest (~80% busy)

Thanks to Juan and Paolo for some useful suggestions. More refinement
is needed (e.g. a smarter way to detect the situation, variable
throttling based on need, etc.). For now I was hoping to get some
feedback or hear about other, more refined ideas.
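To make the detection concrete, here is a minimal standalone sketch of
the heuristic that the patch applies at each dirty-bitmap sync. The
function name should_throttle() and the standalone framing are mine,
for illustration only; the patch inlines this logic in
migration_bitmap_sync():

    /* Sketch of the detection heuristic (illustrative only). Called
     * once per sync period: if the guest dirtied more than half of
     * what the migration transferred in the same period, bump a
     * counter; past N hits, ask for throttling. As in the patch, the
     * counter is never reset. */
    #include <stdbool.h>
    #include <stdint.h>

    #define DIRTY_RATE_HIGH_MAX 5      /* "N" in the patch */

    static bool should_throttle(uint64_t dirtied_bytes,
                                uint64_t xferred_bytes)
    {
        static int dirty_rate_high_cnt;

        if (dirtied_bytes > xferred_bytes / 2) {
            if (dirty_rate_high_cnt++ > DIRTY_RATE_HIGH_MAX) {
                return true;           /* not converging: throttle */
            }
        }
        return false;
    }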
Signed-off-by: Chegu Vinod <chegu_vi...@hp.com>
---
 arch_init.c                   |   37 +++++++++++++++++++++++++++++++
 cpus.c                        |   12 ++++++++++
 include/migration/migration.h |    9 +++++++
 include/qemu/main-loop.h      |    3 ++
 kvm-all.c                     |   49 +++++++++++++++++++++++++++++++++++++++++
 migration.c                   |    6 +++++
 6 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 92de1bd..a06ff81 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,7 @@ int graphic_depth = 15;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static uint64_t mig_throttle_on;
 
 /***********************************************************/
 /* ram save/restore */
@@ -379,12 +380,19 @@ static void migration_bitmap_sync(void)
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
     static int64_t num_dirty_pages_period;
+    static int64_t bytes_xfer_prev;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+    static int dirty_rate_high_cnt;
 
     if (!start_time) {
         start_time = qemu_get_clock_ms(rt_clock);
     }
 
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
+
     trace_migration_bitmap_sync_start();
     memory_global_sync_dirty_bitmap(get_system_memory());
 
@@ -404,6 +412,23 @@ static void migration_bitmap_sync(void)
 
     /* more than 1 second = 1000 millisecons */
     if (end_time > start_time + 1000) {
+        /* The following detection logic can be refined later. For now:
+           check whether the bytes dirtied in this period amount to more
+           than 50% of the bytes that actually got transferred since the
+           last time we were in this routine.
+           If that happens N times (for now N==5), we turn on the
+           throttle-down logic. */
+        bytes_xfer_now = ram_bytes_transferred();
+        if (s->dirty_pages_rate &&
+            ((num_dirty_pages_period * TARGET_PAGE_SIZE) >
+             ((bytes_xfer_now - bytes_xfer_prev) / 2))) {
+            if (dirty_rate_high_cnt++ > 5) {
+                DPRINTF("Unable to converge. Throttling down guest\n");
+                mig_throttle_on = 1;
+            }
+        }
+        bytes_xfer_prev = bytes_xfer_now;
+
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
@@ -496,6 +521,18 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
     return bytes_sent;
 }
 
+bool throttling_needed(void)
+{
+    bool value;
+
+    qemu_mutex_lock_mig_throttle();
+    value = mig_throttle_on;
+    qemu_mutex_unlock_mig_throttle();
+
+    return value;
+}
+
 static uint64_t bytes_transferred;
 
 static ram_addr_t ram_save_remaining(void)
diff --git a/cpus.c b/cpus.c
index 5a98a37..eea6601 100644
--- a/cpus.c
+++ b/cpus.c
@@ -616,6 +616,7 @@ static void qemu_tcg_init_cpu_signals(void)
 #endif /* _WIN32 */
 
 static QemuMutex qemu_global_mutex;
+static QemuMutex qemu_mig_throttle_mutex;
 static QemuCond qemu_io_proceeded_cond;
 static bool iothread_requesting_mutex;
 
@@ -638,6 +639,7 @@ void qemu_init_cpu_loop(void)
     qemu_cond_init(&qemu_work_cond);
     qemu_cond_init(&qemu_io_proceeded_cond);
     qemu_mutex_init(&qemu_global_mutex);
+    qemu_mutex_init(&qemu_mig_throttle_mutex);
 
     qemu_thread_get_self(&io_thread);
 }
@@ -923,6 +925,16 @@ static bool qemu_in_vcpu_thread(void)
     return cpu_single_env && qemu_cpu_is_self(ENV_GET_CPU(cpu_single_env));
 }
 
+void qemu_mutex_lock_mig_throttle(void)
+{
+    qemu_mutex_lock(&qemu_mig_throttle_mutex);
+}
+
+void qemu_mutex_unlock_mig_throttle(void)
+{
+    qemu_mutex_unlock(&qemu_mig_throttle_mutex);
+}
+
 void qemu_mutex_lock_iothread(void)
 {
     if (!tcg_enabled()) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index e2acec6..cccee91 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -92,6 +92,15 @@ uint64_t ram_bytes_remaining(void);
 uint64_t ram_bytes_transferred(void);
 uint64_t ram_bytes_total(void);
 
+#ifndef _QEMU_MIG_THROTTLE
+#define _QEMU_MIG_THROTTLE
+
+bool throttling_needed(void);
+bool throttling_now(void);
+void *migration_throttle_down(void *);
+
+#endif
+
 extern SaveVMHandlers savevm_ram_handlers;
 
 uint64_t dup_mig_bytes_transferred(void);
diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
index 6f0200a..9a3886d 100644
--- a/include/qemu/main-loop.h
+++ b/include/qemu/main-loop.h
@@ -299,6 +299,9 @@ void qemu_mutex_lock_iothread(void);
  */
 void qemu_mutex_unlock_iothread(void);
 
+void qemu_mutex_lock_mig_throttle(void);
+void qemu_mutex_unlock_mig_throttle(void);
+
 /* internal interfaces */
 
 void qemu_fd_register(int fd);
diff --git a/kvm-all.c b/kvm-all.c
index 2d92721..95010ce 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -33,6 +33,8 @@
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 #include "qemu/event_notifier.h"
+#include "sysemu/cpus.h"
+#include "migration/migration.h"
 
 /* This check must be after config-host.h is included */
 #ifdef CONFIG_EVENTFD
@@ -116,6 +118,8 @@ static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_LAST_INFO
 };
 
+static void mig_delay_vcpu(void);
+
 static KVMSlot *kvm_alloc_slot(KVMState *s)
 {
     int i;
@@ -1609,6 +1613,10 @@ int kvm_cpu_exec(CPUArchState *env)
         }
         qemu_mutex_unlock_iothread();
 
+        if (throttling_needed()) {
+            mig_delay_vcpu();
+        }
+
         run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
 
         qemu_mutex_lock_iothread();
@@ -2032,3 +2040,44 @@ int kvm_on_sigbus(int code, void *addr)
 {
     return kvm_arch_on_sigbus(code, addr);
 }
+
+static bool throttling;
+bool throttling_now(void)
+{
+    return throttling;
+}
+
+static void mig_delay_vcpu(void)
+{
+    g_usleep(50 * 1000);
+}
+
+/* Stub used to get the vcpu out of VM context and into qemu via
+   run_on_cpu(). */
+static void mig_kick_cpu(void *opq)
+{
+    return;
+}
+
+/* To reduce the dirty rate, explicitly disallow the VCPUs from spending
+   much time in VM context. The migration thread will try to catch up.
+   The workload experiences a greater performance drop, but for a
+   shorter duration. */
+void *migration_throttle_down(void *opaque)
+{
+    throttling = true;
+    while (throttling_needed()) {
+        CPUArchState *penv = first_cpu;
+        while (penv) {
+            qemu_mutex_lock_iothread();
+            run_on_cpu(ENV_GET_CPU(penv), mig_kick_cpu, NULL);
+            qemu_mutex_unlock_iothread();
+            penv = penv->next_cpu;
+        }
+        g_usleep(25 * 1000);
+    }
+    throttling = false;
+    return NULL;
+}
diff --git a/migration.c b/migration.c
index 3eb0fad..a464afc 100644
--- a/migration.c
+++ b/migration.c
@@ -24,6 +24,7 @@
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "sysemu/cpus.h"
 
 //#define DEBUG_MIGRATION
 
@@ -503,6 +504,7 @@ static void *migration_thread(void *opaque)
     int64_t max_size = 0;
     int64_t start_time = initial_time;
     bool old_vm_running = false;
+    QemuThread thread;
 
     DPRINTF("beginning savevm\n");
     qemu_savevm_state_begin(s->file, &s->params);
@@ -517,6 +519,10 @@ static void *migration_thread(void *opaque)
         DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
         if (pending_size && pending_size >= max_size) {
             qemu_savevm_state_iterate(s->file);
+            if (throttling_needed() && !throttling_now()) {
+                qemu_thread_create(&thread, migration_throttle_down,
+                                   NULL, QEMU_THREAD_DETACHED);
+            }
         } else {
             DPRINTF("done iterating\n");
             qemu_mutex_lock_iothread();
-- 
1.7.1
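P.S. A back-of-the-envelope model of the throttle duty cycle (my own
illustration, not code from the patch): while the throttle is on, each
VCPU sleeps 50ms in mig_delay_vcpu() before every KVM_RUN, and
migration_throttle_down() kicks all VCPUs out of guest mode every 25ms,
so a VCPU runs for at most ~25ms between 50ms sleeps:

    #include <stdio.h>

    int main(void)
    {
        const double kick_period_ms = 25.0; /* g_usleep(25 * 1000) */
        const double vcpu_delay_ms = 50.0;  /* g_usleep(50 * 1000) */

        /* Upper bound: one full kick period of guest time, then a
         * forced sleep before the next KVM_RUN. */
        double guest_fraction = kick_period_ms /
                                (kick_period_ms + vcpu_delay_ms);
        printf("max guest-time duty cycle: ~%.0f%%\n",
               guest_fraction * 100);
        return 0;                           /* prints ~33% */
    }

In other words, a throttled VCPU spends at most roughly a third of
wall-clock time in guest context (less in practice, given exit/entry
overhead), which is what pulls the dirty rate below the transfer rate.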