* Peter Xu (pet...@redhat.com) wrote:
> Currently we are doing log_clear() right after log_sync(), which mostly
> keeps the old behavior from when log_clear() was still part of log_sync().
> 
> This patch tries to further optimize the migration log_clear() code
> path by splitting huge log_clear()s into smaller chunks.
> 
> We do this by splitting the whole guest memory region into memory
> chunks, whose size is decided by MigrationState.clear_bitmap_shift (an
> example is given below).  With that, we don't do the dirty bitmap
> clear operation on the remote node (e.g., KVM) when we fetch the dirty
> bitmap; instead we explicitly clear the dirty bitmap of a memory
> chunk the first time we send a page within that chunk.
> 
> Here is an example.
> 
> Assume the guest has 64G of memory.  Before this patch, the KVM ioctl
> KVM_CLEAR_DIRTY_LOG would be a single call covering all 64G of memory.
> After this patch, assuming a clear bitmap shift of 18, the memory
> chunk size on x86_64 will be 1UL<<18 * 4K = 1GB.  So instead of
> sending one big 64G ioctl, we'll send 64 small ioctls, each covering
> 1G of guest memory.  Each of the 64 small ioctls is sent only when a
> page in that chunk is about to be sent right away.
> 
> Signed-off-by: Peter Xu <pet...@redhat.com>
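
As a quick sanity check of the arithmetic above, here is a standalone
sketch (not QEMU code; it assumes 4K guest pages, i.e. TARGET_PAGE_BITS
== 12):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      const unsigned page_bits = 12;           /* 4K guest pages */
      const unsigned shift = 18;               /* clear bitmap shift */
      const uint64_t guest_mem = 64ULL << 30;  /* 64G guest */

      /* One clear-bitmap bit covers (1 << shift) guest pages */
      uint64_t chunk = 1ULL << (page_bits + shift);

      printf("chunk size: %" PRIu64 " MB\n", chunk >> 20);    /* 1024 MB */
      printf("ioctls:     %" PRIu64 "\n", guest_mem / chunk); /* 64 */
      return 0;
  }

This prints a 1024 MB chunk size and 64 ioctls, matching the figures in
the commit message.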
Reviewed-by: Dr. David Alan Gilbert <dgilb...@redhat.com>

> ---
>  include/exec/ram_addr.h | 76 +++++++++++++++++++++++++++++++++++++++--
>  migration/migration.c   |  4 +++
>  migration/migration.h   | 27 +++++++++++++++
>  migration/ram.c         | 44 ++++++++++++++++++++++++
>  migration/trace-events  |  1 +
>  5 files changed, 150 insertions(+), 2 deletions(-)
> 
> diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
> index 34141060e1..bb92b4c32c 100644
> --- a/include/exec/ram_addr.h
> +++ b/include/exec/ram_addr.h
> @@ -50,8 +50,70 @@ struct RAMBlock {
>      unsigned long *unsentmap;
>      /* bitmap of already received pages in postcopy */
>      unsigned long *receivedmap;
> +
> +    /*
> +     * Bitmap to track postponed dirty-bitmap clears.  When a bit is
> +     * set, it means the corresponding memory chunk still needs a
> +     * log-clear.  Set this to non-NULL to enable the capability to
> +     * postpone and split the clearing of the dirty bitmap on the
> +     * remote node (e.g., KVM).  The bitmap will be set only when
> +     * doing a global sync.
> +     *
> +     * NOTE: this bitmap is different from the other bitmaps in that
> +     * one bit can represent multiple guest pages (as decided by the
> +     * `clear_bmap_shift' variable below).  On the destination side
> +     * this should always be NULL, and the variable `clear_bmap_shift'
> +     * is meaningless.
> +     */
> +    unsigned long *clear_bmap;
> +    uint8_t clear_bmap_shift;
>  };
> 
> +/**
> + * clear_bmap_size: calculate clear bitmap size
> + *
> + * @pages: number of guest pages
> + * @shift: guest page number shift
> + *
> + * Returns: number of bits for the clear bitmap
> + */
> +static inline long clear_bmap_size(uint64_t pages, uint8_t shift)
> +{
> +    return DIV_ROUND_UP(pages, 1UL << shift);
> +}
> +
> +/**
> + * clear_bmap_set: set clear bitmap for the page range
> + *
> + * @rb: the ramblock to operate on
> + * @start: the start page number
> + * @npages: number of pages to set in the bitmap
> + *
> + * Returns: None
> + */
> +static inline void clear_bmap_set(RAMBlock *rb, uint64_t start,
> +                                  uint64_t npages)
> +{
> +    uint8_t shift = rb->clear_bmap_shift;
> +
> +    bitmap_set_atomic(rb->clear_bmap, start >> shift,
> +                      clear_bmap_size(npages, shift));
> +}
> +
> +/**
> + * clear_bmap_test_and_clear: test clear bitmap for the page; clear if set
> + *
> + * @rb: the ramblock to operate on
> + * @page: the page number to check
> + *
> + * Returns: true if the bit was set, false otherwise
> + */
> +static inline bool clear_bmap_test_and_clear(RAMBlock *rb, uint64_t page)
> +{
> +    uint8_t shift = rb->clear_bmap_shift;
> +
> +    return bitmap_test_and_clear_atomic(rb->clear_bmap, page >> shift, 1);
> +}
> +
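
The semantics of these helpers are easy to model outside QEMU.  Below
is a toy, non-atomic sketch (the real code uses QEMU's
bitmap_set_atomic() and bitmap_test_and_clear_atomic(); the SHIFT
value, page numbers, and single-word bitmap are all illustrative).
The point it demonstrates: only the first page touched in a chunk
reports the bit as set.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define SHIFT 6                /* one bit covers 1 << 6 = 64 pages */

  static uint64_t clear_bmap;    /* toy bitmap; 64 chunks is plenty here */

  /* Mirrors clear_bmap_set(): sets DIV_ROUND_UP(npages, 1 << SHIFT) bits */
  static void toy_set(uint64_t start, uint64_t npages)
  {
      uint64_t first = start >> SHIFT;
      uint64_t nbits = (npages + (1ULL << SHIFT) - 1) >> SHIFT;

      for (uint64_t i = first; i < first + nbits; i++) {
          clear_bmap |= 1ULL << i;
      }
  }

  /* Mirrors clear_bmap_test_and_clear() for a single page */
  static bool toy_test_and_clear(uint64_t page)
  {
      uint64_t mask = 1ULL << (page >> SHIFT);
      bool was_set = clear_bmap & mask;

      clear_bmap &= ~mask;
      return was_set;
  }

  int main(void)
  {
      toy_set(0, 200);                         /* pages 0..199 -> bits 0..3 */
      printf("%d\n", toy_test_and_clear(70));  /* 1: first page in chunk 1 */
      printf("%d\n", toy_test_and_clear(71));  /* 0: chunk 1 already cleared */
      return 0;
  }

This first-touch behaviour is what keeps the expensive remote
log-clear down to at most one call per chunk per sync round.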
>  static inline bool offset_in_ramblock(RAMBlock *b, ram_addr_t offset)
>  {
>      return (b && b->host && offset < b->used_length) ? true : false;
> @@ -462,8 +524,18 @@ uint64_t cpu_physical_memory_sync_dirty_bitmap(RAMBlock *rb,
>              }
>          }
> 
> -        /* TODO: split the huge bitmap into smaller chunks */
> -        memory_region_clear_dirty_bitmap(rb->mr, start, length);
> +        if (rb->clear_bmap) {
> +            /*
> +             * Postpone the dirty bitmap clear to the point right
> +             * before we really send the pages; we also split the
> +             * clear-dirty procedure into smaller chunks.
> +             */
> +            clear_bmap_set(rb, start >> TARGET_PAGE_BITS,
> +                           length >> TARGET_PAGE_BITS);
> +        } else {
> +            /* Slow path - still do it in one huge chunk */
> +            memory_region_clear_dirty_bitmap(rb->mr, start, length);
> +        }
>      } else {
>          ram_addr_t offset = rb->offset;
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 2865ae3fa9..8a607fe1e2 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3362,6 +3362,8 @@ void migration_global_dump(Monitor *mon)
>                     ms->send_section_footer ? "on" : "off");
>      monitor_printf(mon, "decompress-error-check: %s\n",
>                     ms->decompress_error_check ? "on" : "off");
> +    monitor_printf(mon, "clear-bitmap-shift: %u\n",
> +                   ms->clear_bitmap_shift);
>  }
> 
>  #define DEFINE_PROP_MIG_CAP(name, x) \
> @@ -3376,6 +3378,8 @@ static Property migration_properties[] = {
>                       send_section_footer, true),
>      DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
>                       decompress_error_check, true),
> +    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
> +                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
> 
>      /* Migration parameters */
>      DEFINE_PROP_UINT8("x-compress-level", MigrationState,
> diff --git a/migration/migration.h b/migration/migration.h
> index 780a096857..6e3178d8b2 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -27,6 +27,23 @@ struct PostcopyBlocktimeContext;
> 
>  #define MIGRATION_RESUME_ACK_VALUE (1)
> 
> +/*
> + * 1<<6=64 pages -> 256K chunk when page size is 4K.  This gives us
> + * the benefit that all chunks are aligned to 64 pages, so the
> + * bitmaps are always aligned to unsigned long.
> + */
> +#define CLEAR_BITMAP_SHIFT_MIN 6
> +/*
> + * 1<<18=256K pages -> 1G chunk when page size is 4K.  This is the
> + * default value used when none is specified.
> + */
> +#define CLEAR_BITMAP_SHIFT_DEFAULT 18
> +/*
> + * 1<<31=2G pages -> 8T chunk when page size is 4K.  This should be
> + * big enough to make sure we won't overflow easily.
> + */
> +#define CLEAR_BITMAP_SHIFT_MAX 31
> +
>  /* State for the incoming migration */
>  struct MigrationIncomingState {
>      QEMUFile *from_src_file;
> @@ -233,6 +250,16 @@ struct MigrationState
>       * do not trigger spurious decompression errors.
>       */
>      bool decompress_error_check;
> +
> +    /*
> +     * This decides the size of the guest memory chunks used to track
> +     * dirty bitmap clearing.  The chunk size is GUEST_PAGE_SIZE << N.
> +     * E.g., N=0 means we clear the dirty bitmap for every single page
> +     * we send (1<<0=1); N=10 means we clear the dirty bitmap once per
> +     * 1<<10=1K contiguous guest pages (a 4M chunk with 4K pages).
> +     */
> +    uint8_t clear_bitmap_shift;
>  };
> 
>  void migrate_set_state(int *state, int old_state, int new_state);
> diff --git a/migration/ram.c b/migration/ram.c
> index dc916042fb..632bf02575 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1668,6 +1668,33 @@ static inline bool migration_bitmap_clear_dirty(RAMState *rs,
>      bool ret;
> 
>      qemu_mutex_lock(&rs->bitmap_mutex);
> +
> +    /*
> +     * Clear the dirty bitmap if needed.  This _must_ be done before
> +     * we send any page in the chunk, because we need to make sure we
> +     * capture further page content changes the next time we sync the
> +     * dirty log.  So as long as we are going to send any page in the
> +     * chunk, we clear the remote dirty bitmap for the whole chunk.
> +     * Clearing it earlier is not a problem, but clearing it too late
> +     * is.
> +     */
> +    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
> +        uint8_t shift = rb->clear_bmap_shift;
> +        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
> +        hwaddr start = (page << TARGET_PAGE_BITS) & (-size);
> +
> +        /*
> +         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It
> +         * makes things easier since the start address of each small
> +         * chunk is then always aligned to 64 pages, so the bitmap is
> +         * always aligned to unsigned long.  We could probably even
> +         * remove this restriction, but it is kept for now.
> +         */
> +        assert(shift >= 6);
> +        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
> +        memory_region_clear_dirty_bitmap(rb->mr, start, size);
> +    }
> +
>      ret = test_and_clear_bit(page, rb->bmap);
> 
>      if (ret) {
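
One detail worth spelling out in the hunk above is the '& (-size)'
trick: since size is a power of two, -size in two's-complement
arithmetic is a mask with the low (TARGET_PAGE_BITS + shift) bits
cleared, so the AND rounds the page's address down to its chunk
boundary.  A standalone sketch with illustrative values (4K pages,
shift 18, an arbitrary page number):

  #include <inttypes.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
      const unsigned target_page_bits = 12;  /* 4K pages */
      const unsigned shift = 18;             /* 1G chunks */
      uint64_t page = 300000;                /* arbitrary dirty page number */

      uint64_t size = 1ULL << (target_page_bits + shift);
      /* -size == ~(size - 1) for a power-of-two size */
      uint64_t start = (page << target_page_bits) & (-size);

      printf("page addr:   0x%" PRIx64 "\n", page << target_page_bits);
      printf("chunk start: 0x%" PRIx64 "\n", start); /* 0x40000000, the 1G mark */
      printf("chunk size:  0x%" PRIx64 "\n", size);
      return 0;
  }

Note also that the clear_bmap itself is tiny: with shift 18, a 64G
guest (16M pages) needs only DIV_ROUND_UP(16M, 256K) = 64 bits, as
allocated in ram_list_init_bitmaps() below.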
> @@ -2685,6 +2712,8 @@ static void ram_save_cleanup(void *opaque)
>      memory_global_dirty_log_stop();
> 
>      RAMBLOCK_FOREACH_NOT_IGNORED(block) {
> +        g_free(block->clear_bmap);
> +        block->clear_bmap = NULL;
>          g_free(block->bmap);
>          block->bmap = NULL;
>          g_free(block->unsentmap);
> @@ -3195,15 +3224,30 @@ static int ram_state_init(RAMState **rsp)
> 
>  static void ram_list_init_bitmaps(void)
>  {
> +    MigrationState *ms = migrate_get_current();
>      RAMBlock *block;
>      unsigned long pages;
> +    uint8_t shift;
> 
>      /* Skip setting bitmap if there is no RAM */
>      if (ram_bytes_total()) {
> +        shift = ms->clear_bitmap_shift;
> +        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
> +            error_report("clear_bitmap_shift (%u) too big, using "
> +                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
> +            shift = CLEAR_BITMAP_SHIFT_MAX;
> +        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
> +            error_report("clear_bitmap_shift (%u) too small, using "
> +                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
> +            shift = CLEAR_BITMAP_SHIFT_MIN;
> +        }
> +
>          RAMBLOCK_FOREACH_NOT_IGNORED(block) {
>              pages = block->max_length >> TARGET_PAGE_BITS;
>              block->bmap = bitmap_new(pages);
>              bitmap_set(block->bmap, 0, pages);
> +            block->clear_bmap_shift = shift;
> +            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>              if (migrate_postcopy_ram()) {
>                  block->unsentmap = bitmap_new(pages);
>                  bitmap_set(block->unsentmap, 0, pages);
> diff --git a/migration/trace-events b/migration/trace-events
> index de2e136e57..2c45974330 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -79,6 +79,7 @@ get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_
>  get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned long page_abs, int sent) "%s/0x%" PRIx64 " page_abs=0x%lx (sent=%d)"
>  migration_bitmap_sync_start(void) ""
>  migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
> +migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
>  migration_throttle(void) ""
>  multifd_recv(uint8_t id, uint64_t packet_num, uint32_t used, uint32_t flags, uint32_t next_packet_size) "channel %d packet number %" PRIu64 " pages %d flags 0x%x next packet size %d"
>  multifd_recv_sync_main(long packet_num) "packet num %ld"
> -- 
> 2.17.1
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK