* Peter Xu (pet...@redhat.com) wrote:
> Currently we are doing log_clear() right after log_sync(), which mostly
> keeps the old behavior from when log_clear() was still part of log_sync().
> 
> This patch tries to further optimize the migration log_clear() code
> path by splitting huge log_clear()s into smaller chunks.
> 
> We do this by splitting the whole guest memory region into memory
> chunks, whose size is decided by MigrationState.clear_bitmap_shift (an
> example is given below).  With that, we don't do the dirty bitmap
> clear operation on the remote node (e.g., KVM) when we fetch the dirty
> bitmap; instead we explicitly clear the dirty bitmap of a memory
> chunk the first time we send a page within that chunk.
> 
> Here is an example.
> 
> Assume the guest has 64G of memory.  Before this patch, the KVM ioctl
> KVM_CLEAR_DIRTY_LOG would be a single call covering all 64G of memory.
> After this patch, assuming a clear bitmap shift of 18, the memory
> chunk size on x86_64 will be 1UL<<18 * 4K = 1GB.  So instead of
> sending one big 64G ioctl, we'll send 64 small ioctls, each covering
> 1G of guest memory.  Each of the 64 small ioctls is sent only when a
> page in that chunk is about to be sent right away.
> 
> Signed-off-by: Peter Xu <pet...@redhat.com>
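
As a quick sanity check of the arithmetic above, here is a standalone
sketch (not QEMU code; it assumes 4K guest pages, i.e. TARGET_PAGE_BITS
== 12):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      const unsigned page_bits = 12;           /* 4K guest pages */
      const unsigned shift = 18;               /* clear bitmap shift */
      const uint64_t guest_mem = 64ULL << 30;  /* 64G guest */

      /* One clear-bitmap bit covers (1 << shift) guest pages */
      uint64_t chunk = 1ULL << (page_bits + shift);

      printf("chunk size: %" PRIu64 " MB\n", chunk >> 20);    /* 1024 MB */
      printf("ioctls:     %" PRIu64 "\n", guest_mem / chunk); /* 64 */
      return 0;
  }

This prints a 1024 MB chunk size and 64 ioctls, matching the figures in
the commit message.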
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>

> ---
>  include/exec/ram_addr.h | 76 +++++++++++++++++++++++++++++++++++++++--
>  migration/migration.c   |  4 +++
>  migration/migration.h   | 27 +++++++++++++++
>  migration/ram.c         | 44 ++++++++++++++++++++++++
>  migration/trace-events  |  1 +
>  5 files changed, 150 insertions(+), 2 deletions(-)
> 
> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
> index 34141060e1..bb92b4c32c 100644
> --- a/include/exec/ram_addr.h
> +++ b/include/exec/ram_addr.h
> @@ -50,8 +50,70 @@ struct RAMBlock {
>      unsigned long *unsentmap;
>      /* bitmap of already received pages in postcopy */
>      unsigned long *receivedmap;
> +
> +    /*
> +     * Bitmap to track postponed dirty-bitmap clears.  When a bit is
> +     * set, it means the corresponding memory chunk still needs a
> +     * log-clear.  Set this to non-NULL to enable the capability to
> +     * postpone and split the clearing of the dirty bitmap on the
> +     * remote node (e.g., KVM).  The bitmap will be set only when
> +     * doing a global sync.
> +     *
> +     * NOTE: this bitmap is different from the other bitmaps in that
> +     * one bit can represent multiple guest pages (as decided by the
> +     * `clear_bmap_shift' variable below).  On the destination side
> +     * this should always be NULL, and the variable `clear_bmap_shift'
> +     * is meaningless.
> +     */
> +    unsigned long *clear_bmap;
> +    uint8_t clear_bmap_shift;
>  };
> 
> +/**
> + * clear_bmap_size: calculate clear bitmap size
> + *
> + * @pages: number of guest pages
> + * @shift: guest page number shift
> + *
> + * Returns: number of bits for the clear bitmap
> + */
> +static inline long clear_bmap_size(uint64_t pages, uint8_t shift)
> +{
> +    return DIV_ROUND_UP(pages, 1UL << shift);
> +}
> +
> +/**
> + * clear_bmap_set: set clear bitmap for the page range
> + *
> + * @rb: the ramblock to operate on
> + * @start: the start page number
> + * @npages: number of pages to set in the bitmap
> + *
> + * Returns: None
> + */
> +static inline void clear_bmap_set(RAMBlock *rb, uint64_t start,
> +                                  uint64_t npages)
> +{
> +    uint8_t shift = rb->clear_bmap_shift;
> +
> +    bitmap_set_atomic(rb->clear_bmap, start >> shift,
> +                      clear_bmap_size(npages, shift));
> +}
> +
> +/**
> + * clear_bmap_test_and_clear: test clear bitmap for the page; clear if set
> + *
> + * @rb: the ramblock to operate on
> + * @page: the page number to check
> + *
> + * Returns: true if the bit was set, false otherwise
> + */
> +static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page)
> +{
> +    uint8_t shift = rb->clear_bmap_shift;
> +
> +    return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1);
> +}
> +
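
The semantics of these helpers are easy to model outside QEMU.  Below
is a toy, non-atomic sketch (the real code uses QEMU's
bitmap_set_atomic() and bitmap_test_and_clear_atomic(); the SHIFT
value, page numbers, and single-word bitmap are all illustrative).
The point it demonstrates: only the first page touched in a chunk
reports the bit as set.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define SHIFT 6                /* one bit covers 1 << 6 = 64 pages */

  static uint64_t clear_bmap;    /* toy bitmap; 64 chunks is plenty here */

  /* Mirrors clear_bmap_set(): sets DIV_ROUND_UP(npages, 1 << SHIFT) bits */
  static void toy_set(uint64_t start, uint64_t npages)
  {
      uint64_t first = start >> SHIFT;
      uint64_t nbits = (npages + (1ULL << SHIFT) - 1) >> SHIFT;

      for (uint64_t i = first; i < first + nbits; i++) {
          clear_bmap |= 1ULL << i;
      }
  }

  /* Mirrors clear_bmap_test_and_clear() for a single page */
  static bool toy_test_and_clear(uint64_t page)
  {
      uint64_t mask = 1ULL << (page >> SHIFT);
      bool was_set = clear_bmap & mask;

      clear_bmap &= ~mask;
      return was_set;
  }

  int main(void)
  {
      toy_set(0, 200);                         /* pages 0..199 -> bits 0..3 */
      printf("%d\n", toy_test_and_clear(70));  /* 1: first page in chunk 1 */
      printf("%d\n", toy_test_and_clear(71));  /* 0: chunk 1 already cleared */
      return 0;
  }

This first-touch behaviour is what keeps the expensive remote
log-clear down to at most one call per chunk per sync round.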
>  static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
>  {
>      return (b && b->host && offset < b->used_length) ? true : false;
> @@ -462,8 +524,18 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
>              }
>          }
> 
> -        /* TODO: split the huge bitmap into smaller chunks */
> -        memory_region_clear_dirty_bitmap(rb->mr, start, length);
> +        if (rb->clear_bmap) {
> +            /*
> +             * Postpone the dirty bitmap clear to the point right
> +             * before we really send the pages; we also split the
> +             * clear-dirty procedure into smaller chunks.
> +             */
> +            clear_bmap_set(rb, start >> TARGET_PAGE_BITS,
> +                           length >> TARGET_PAGE_BITS);
> +        } else {
> +            /* Slow path - still do it in one huge chunk */
> +            memory_region_clear_dirty_bitmap(rb->mr, start, length);
> +        }
>      } else {
>          ram_addr_t offset = rb->offset;
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 2865ae3fa9..8a607fe1e2 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3362,6 +3362,8 @@ void migration_global_dump(Monitor *mon)
>                     ms->send_section_footer ? "on" : "off");
>      monitor_printf(mon, "decompress-error-check: %s\n",
>                     ms->decompress_error_check ? "on" : "off");
> +    monitor_printf(mon, "clear-bitmap-shift: %u\n",
> +                   ms->clear_bitmap_shift);
>  }
> 
>  #define DEFINE_PROP_MIG_CAP(name, x) \
> @@ -3376,6 +3378,8 @@ static Property migration_properties[] = {
>                       send_section_footer, true),
>      DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
>                       decompress_error_check, true),
> +    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
> +                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
> 
>      /* Migration parameters */
>      DEFINE_PROP_UINT8("x-compress-level", MigrationState,
> diff --git a/migration/migration.h b/migration/migration.h
> index 780a096857..6e3178d8b2 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -27,6 +27,23 @@ struct PostcopyBlocktimeContext;
> 
>  #define MIGRATION_RESUME_ACK_VALUE (1)
> 
> +/*
> + * 1<<6=64 pages -> 256K chunk when page size is 4K.  This gives us
> + * the benefit that all chunks are aligned to 64 pages, so the
> + * bitmaps are always aligned to unsigned long.
> + */
> +#define CLEAR_BITMAP_SHIFT_MIN 6
> +/*
> + * 1<<18=256K pages -> 1G chunk when page size is 4K.  This is the
> + * default value used when none is specified.
> + */
> +#define CLEAR_BITMAP_SHIFT_DEFAULT 18
> +/*
> + * 1<<31=2G pages -> 8T chunk when page size is 4K.  This should be
> + * big enough to make sure we won't overflow easily.
> + */
> +#define CLEAR_BITMAP_SHIFT_MAX 31
> +
>  /* State for the incoming migration */
>  struct MigrationIncomingState {
>      QEMUFile *from_src_file;
> @@ -233,6 +250,16 @@ struct MigrationState
>       * do not trigger spurious decompression errors.
>       */
>      bool decompress_error_check;
> +
> +    /*
> +     * This decides the size of the guest memory chunks used to track
> +     * dirty bitmap clearing.  The chunk size is GUEST_PAGE_SIZE << N.
> +     * E.g., N=0 means we clear the dirty bitmap for every single page
> +     * we send (1<<0=1); N=10 means we clear the dirty bitmap once per
> +     * 1<<10=1K contiguous guest pages (a 4M chunk with 4K pages).
> +     */
> +    uint8_t clear_bitmap_shift;
>  };
> 
>  void migrate_set_state(int *state, int old_state, int new_state);
> diff --git a/migration/ram.c b/migration/ram.c
> index dc916042fb..632bf02575 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1668,6 +1668,33 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
>      bool ret;
> 
>      qemu_mutex_lock(&rs->bitmap_mutex);
> +
> +    /*
> +     * Clear the dirty bitmap if needed.  This _must_ be done before
> +     * we send any page in the chunk, because we need to make sure we
> +     * capture further page content changes the next time we sync the
> +     * dirty log.  So as long as we are going to send any page in the
> +     * chunk, we clear the remote dirty bitmap for the whole chunk.
> +     * Clearing it earlier is not a problem, but clearing it too late
> +     * is.
> +     */
> +    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
> +        uint8_t shift = rb->clear_bmap_shift;
> +        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
> +        hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
> +
> +        /*
> +         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It
> +         * makes things easier since the start address of each small
> +         * chunk is then always aligned to 64 pages, so the bitmap is
> +         * always aligned to unsigned long.  We could probably even
> +         * remove this restriction, but it is kept for now.
> +         */
> +        assert(shift >= 6);
> +        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
> +        memory_region_clear_dirty_bitmap(rb->mr, start, size);
> +    }
> +
>      ret = test_and_clear_bit(page, rb->bmap);
> 
>      if (ret) {
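
One detail worth spelling out in the hunk above is the '& (-size)'
trick: since size is a power of two, -size in two's-complement
arithmetic is a mask with the low (TARGET_PAGE_BITS + shift) bits
cleared, so the AND rounds the page's address down to its chunk
boundary.  A standalone sketch with illustrative values (4K pages,
shift 18, an arbitrary page number):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      const unsigned target_page_bits = 12;  /* 4K pages */
      const unsigned shift = 18;             /* 1G chunks */
      uint64_t page = 300000;                /* arbitrary dirty page number */

      uint64_t size = 1ULL << (target_page_bits + shift);
      /* -size == ~(size - 1) for a power-of-two size */
      uint64_t start = (page << target_page_bits) & (-size);

      printf("page addr:   0x%" PRIx64 "\n", page << target_page_bits);
      printf("chunk start: 0x%" PRIx64 "\n", start); /* 0x40000000, the 1G mark */
      printf("chunk size:  0x%" PRIx64 "\n", size);
      return 0;
  }

Note also that the clear_bmap itself is tiny: with shift 18, a 64G
guest (16M pages) needs only DIV_ROUND_UP(16M, 256K) = 64 bits, as
allocated in ram_list_init_bitmaps() below.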
> @@ -2685,6 +2712,8 @@ static void ram_save_cleanup(void *opaque)
>      memory_global_dirty_log_stop();
> 
>      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
> +        g_free(block->clear_bmap);
> +        block->clear_bmap = NULL;
>          g_free(block->bmap);
>          block->bmap = NULL;
>          g_free(block->unsentmap);
> @@ -3195,15 +3224,30 @@ static int ram_state_init(RAMState **rsp)
> 
>  static void ram_list_init_bitmaps(void)
>  {
> +    MigrationState *ms = migrate_get_current();
>      RAMBlock *block;
>      unsigned long pages;
> +    uint8_t shift;
> 
>      /* Skip setting bitmap if there is no RAM */
>      if (ram_bytes_total()) {
> +        shift = ms->clear_bitmap_shift;
> +        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
> +            error_report("clear_bitmap_shift (%u) too big, using "
> +                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
> +            shift = CLEAR_BITMAP_SHIFT_MAX;
> +        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
> +            error_report("clear_bitmap_shift (%u) too small, using "
> +                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
> +            shift = CLEAR_BITMAP_SHIFT_MIN;
> +        }
> +
>          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
>              pages = block->max_length >> TARGET_PAGE_BITS;
>              block->bmap = bitmap_new(pages);
>              bitmap_set(block->bmap, 0, pages);
> +            block->clear_bmap_shift = shift;
> +            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>              if (migrate_postcopy_ram()) {
>                  block->unsentmap = bitmap_new(pages);
>                  bitmap_set(block->unsentmap, 0, pages);
> diff --git a/migration/trace-events b/migration/trace-events
> index de2e136e57..2c45974330 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -79,6 +79,7 @@ get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_
>  get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned long page_abs, int sent) "%s/0x%" PRIx64 " page_abs=0x%lx (sent=%d)"
>  migration_bitmap_sync_start(void) ""
>  migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
> +migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
>  migration_throttle(void) ""
>  multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %d packet number %" PRIu64 " pages %d flags 0x%x next packet size %d"
>  multifd_recv_sync_main(long packet_num) "packet num %ld"
> -- 
> 2.17.1
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK