Re: [Qemu-devel] [WIP:COLO: 1] Flush colo ram in parallel

Li Zhijian Mon, 29 Feb 2016 20:03:08 -0800

Great work!!
I like this patch very much.


On 02/29/2016 08:41 PM, Hailiang Zhang wrote:

On 2016/2/29 18:19, Dr. David Alan Gilbert (git) wrote:

From: "Dr. David Alan Gilbert" <dgilb...@redhat.com>

Flush the colo ram cache in parallel; use the same number
of threads as CPU cores.

On a VM with 4 cores, and 4GB RAM, I've seen a reduction from
~20ms to ~16ms using this, which is helpful but not as much
as I hoped;   I guess one problem might be that all the changes
could be concentrated in one area of RAM?  Perhaps another approach


Agreed. The dirty pages happened in continuous address are common in real
usage scene, Here, the idea of dividing ramblock by address, and flushing
them in parallel could be used in colo_init_ram_cache(),
where we need to backup the SVM's RAM into cache.


Agreed,  the backup job is very expensive currently, that is a good 
optimization.

Best regards.
Li Zhijian

would be to have one thread searching the bitmap and other threads
doing the copy with some type of work queueu.


I guess, in most case, the time of finding the dirty bitmap
is much faster than copy one page. So i think this method could be more
effective and more reasonable, but let's realize and test it first ;)

Thanks,
Hailiang

[Note: Not for merging at the moment, just for discussion with the COLO guys]

Signed-off-by: Dr. David Alan Gilbert <dgilb...@redhat.com>
---
  migration/ram.c | 215 +++++++++++++++++++++++++++++++++++++++++++++-----------
  1 file changed, 174 insertions(+), 41 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 188c3a1..6458863 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -42,6 +42,7 @@
  #include "qemu/rcu_queue.h"
  #include "migration/colo.h"
  #include "crypto/hash.h"
+#include "sysemu/sysemu.h"

  #ifdef DEBUG_MIGRATION_RAM
  #define DPRINTF(fmt, ...) \
@@ -616,18 +617,18 @@ static inline bool 
migration_bitmap_clear_dirty(ram_addr_t addr)
      ret = test_and_clear_bit(nr, bitmap);

      if (ret) {
-        migration_dirty_pages--;
+        atomic_dec(&migration_dirty_pages);
      }
      return ret;
  }

  static inline
  ram_addr_t ramlist_bitmap_find_and_reset_dirty(RAMBlock *rb,
-                                               ram_addr_t start)
+                                               ram_addr_t start,
+                                               uint64_t rb_size)
  {
      unsigned long base = rb->offset >> TARGET_PAGE_BITS;
      unsigned long nr = base + (start >> TARGET_PAGE_BITS);
-    uint64_t rb_size = rb->used_length;
      unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
      unsigned long next;

@@ -2721,6 +2722,141 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
      return ret;
  }

+/* For parallel flushing of colo ram cache; it has 'n' threads,
+ * trying 'n'==number of cpus, but might be smarter to do something NUMA
+ * Each thread waits for 'triggers' and signals 'completes'.
+ */
+typedef struct ram_cache_threads {
+    QemuThread *threads;
+    QemuEvent  *triggers;
+    QemuEvent  *completes;
+    bool        quit;
+} ColoRamCacheThreads;
+
+static ColoRamCacheThreads colo_flush_threads;
+
+/*
+ * Helper for colo_flush_thread.
+ * Given the current block and the bounds of ram (in pages)
+ * that we're dealing with, then find our next useable block,
+ * and set offset_out/used_out to the limits we're interested in.
+ * Result is NULL when we run out of blocks.
+ */
+static RAMBlock *flush_find_next_block(RAMBlock *block,
+                                       ram_addr_t lower_bound,
+                                       ram_addr_t upper_bound,
+                                       ram_addr_t *offset_out,
+                                       ram_addr_t *used_out)
+{
+    do {
+        if (!block) {
+            block = QLIST_FIRST_RCU(&ram_list.blocks);
+        } else {
+            block = QLIST_NEXT_RCU(block, next);
+        }
+        if (block &&
+            (block->offset + block->used_length) >= lower_bound &&
+            (block->offset < upper_bound)) {
+            /* OK, good, the block is at least partially within our bounds */
+            if (block->offset <= lower_bound) {
+                *offset_out = lower_bound - block->offset;
+            } else {
+                *offset_out = 0;
+            }
+            if ((block->offset + block->used_length) >= upper_bound) {
+                *used_out = upper_bound - block->offset;
+            } else {
+                *used_out = block->used_length;
+            }
+            break;
+        }
+    } while (block);
+
+    return block;
+}
+
+/* Flush thread for COLO ram cache, synchronises a proportion of the
+ * ram cache.
+ */
+static void *colo_flush_thread(void *opaque)
+{
+    int i = (int)(intptr_t)opaque;
+    int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    ram_addr_t lower, upper;
+    /* work out our range, lower..upper-1
+     * want to avoid SMP issues on the dirty bitmap, so make sure
+     * our limits never land on the same 64bit word
+     */
+    ram_addr_t chunk_size = (ram_cache_pages / smp_cpus) & ~63ul;
+
+    lower = i * chunk_size;
+
+    if (i != (smp_cpus - 1)) {
+        upper = (i + 1) * chunk_size;
+    } else {
+        /* Last thread gets deals with extra few pages due to rounding */
+        upper = ram_cache_pages;
+    }
+    lower <<= TARGET_PAGE_BITS;
+    upper <<= TARGET_PAGE_BITS;
+
+    while (true) {
+        RAMBlock *block = NULL;
+        void *dst_host;
+        void *src_host;
+        ram_addr_t offset = ~0, host_off = 0, cache_off = 0, used_length = 0;
+        uint64_t host_dirty = 0, both_dirty = 0;
+
+        /* Wait for work */
+        qemu_event_wait(&colo_flush_threads.triggers[i]);
+        qemu_event_reset(&colo_flush_threads.triggers[i]);
+        if (colo_flush_threads.quit) {
+            break;
+        }
+
+        rcu_read_lock();
+        /* note offset is initialised to ~0 so 1st time through we drop
+         * through to finding a block
+         */
+        do {
+            ram_addr_t ram_addr_abs;
+            if (cache_off == offset) { /* walk ramblock->colo_cache */
+                cache_off = migration_bitmap_find_dirty(block,
+                                                        offset, &ram_addr_abs);
+                if (cache_off < used_length) {
+                    migration_bitmap_clear_dirty(ram_addr_abs);
+                }
+            }
+            if (host_off == offset) { /* walk ramblock->host */
+                host_off = ramlist_bitmap_find_and_reset_dirty(block, offset,
+                               used_length);
+            }
+            if (!block || (host_off >= used_length &&
+                cache_off >= used_length)) {
+                block = flush_find_next_block(block, lower, upper,
+                                              &offset, &used_length);
+                cache_off = host_off = offset;
+            } else {
+                if (host_off <= cache_off) {
+                    offset = host_off;
+                    host_dirty++;
+                    both_dirty += (host_off == cache_off);
+                } else {
+                    offset = cache_off;
+                }
+                dst_host = block->host + offset;
+                src_host = block->colo_cache + offset;
+                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
+            }
+        } while (block);
+        rcu_read_unlock();
+        trace_colo_flush_ram_cache_end(host_dirty, both_dirty);
+
+        qemu_event_set(&colo_flush_threads.completes[i]);
+    }
+
+    return NULL;
+}
  /*
   * colo cache: this is for secondary VM, we cache the whole
   * memory of the secondary VM, it will be called after first migration.
@@ -2729,6 +2865,7 @@ int colo_init_ram_cache(void)
  {
      RAMBlock *block;
      int64_t ram_cache_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+    int i;

      rcu_read_lock();
      QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
@@ -2753,6 +2890,20 @@ int colo_init_ram_cache(void)
      migration_dirty_pages = 0;
      memory_global_dirty_log_start();

+    colo_flush_threads.threads = g_new0(QemuThread, smp_cpus);
+    colo_flush_threads.triggers = g_new0(QemuEvent, smp_cpus);
+    colo_flush_threads.completes = g_new0(QemuEvent, smp_cpus);
+    colo_flush_threads.quit = false;
+    for (i = 0; i < smp_cpus; i++) {
+        char name[32];
+        qemu_event_init(&colo_flush_threads.triggers[i], false);
+        qemu_event_init(&colo_flush_threads.completes[i], false);
+        sprintf(name, "colofl: %d", i);
+        qemu_thread_create(&colo_flush_threads.threads[i], name,
+                           colo_flush_thread,
+                           (void *)(intptr_t)i,
+                           QEMU_THREAD_JOINABLE);
+    }
      return 0;

  out_locked:
@@ -2771,9 +2922,19 @@ void colo_release_ram_cache(void)
  {
      RAMBlock *block;
      struct BitmapRcu *bitmap = migration_bitmap_rcu;
+    int i;

      ram_cache_enable = false;

+    colo_flush_threads.quit = true;
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_set(&colo_flush_threads.triggers[i]);
+        qemu_thread_join(&colo_flush_threads.threads[i]);
+    }
+    g_free(colo_flush_threads.threads);
+    g_free(colo_flush_threads.triggers);
+    g_free(colo_flush_threads.completes);
+
      atomic_rcu_set(&migration_bitmap_rcu, NULL);
      if (bitmap) {
          call_rcu(bitmap, migration_bitmap_free, rcu);
@@ -2795,47 +2956,19 @@ void colo_release_ram_cache(void)
   */
  void colo_flush_ram_cache(void)
  {
-    RAMBlock *block = NULL;
-    void *dst_host;
-    void *src_host;
-    ram_addr_t offset = 0, host_off = 0, cache_off = 0;
-    uint64_t host_dirty = 0, both_dirty = 0;
-
+    int i;
      trace_colo_flush_ram_cache_begin(migration_dirty_pages);
      address_space_sync_dirty_bitmap(&address_space_memory);
-    rcu_read_lock();
-    block = QLIST_FIRST_RCU(&ram_list.blocks);
-    while (block) {
-        ram_addr_t ram_addr_abs;
-        if (cache_off == offset) { /* walk ramblock->colo_cache */
-            cache_off = migration_bitmap_find_dirty(block,
-                                                    offset, &ram_addr_abs);
-            if (cache_off < block->used_length) {
-                migration_bitmap_clear_dirty(ram_addr_abs);
-            }
-        }
-        if (host_off == offset) { /* walk ramblock->host */
-            host_off = ramlist_bitmap_find_and_reset_dirty(block, offset);
-        }
-        if (host_off >= block->used_length &&
-            cache_off >= block->used_length) {
-            cache_off = host_off = offset = 0;
-            block = QLIST_NEXT_RCU(block, next);
-        } else {
-            if (host_off <= cache_off) {
-                offset = host_off;
-                host_dirty++;
-                both_dirty += (host_off == cache_off);
-            } else {
-                offset = cache_off;
-            }
-            dst_host = block->host + offset;
-            src_host = block->colo_cache + offset;
-            memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
-        }
+
+    /* Kick all the flush threads to start work */
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_reset(&colo_flush_threads.completes[i]);
+        qemu_event_set(&colo_flush_threads.triggers[i]);
+    }
+    /* ...and wait for them to finish */
+    for (i = 0; i < smp_cpus; i++) {
+        qemu_event_wait(&colo_flush_threads.completes[i]);
      }
-    rcu_read_unlock();
-    trace_colo_flush_ram_cache_end(host_dirty, both_dirty);
      assert(migration_dirty_pages == 0);
  }

Re: [Qemu-devel] [WIP:COLO: 1] Flush colo ram in parallel

Reply via email to