CCing migration maintainers, feedback otherwise in-line.
On 12/11/2014 09:17 AM, Vladimir Sementsov-Ogievskiy wrote:
Just migrate the parts of the dirty bitmaps corresponding to the block
being migrated. Also, skip block migration if it is disabled (the blk
parameter of the migrate command is false).
Skipping shared sectors: bitmaps are migrated independently of this;
just send the blk without block data.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@parallels.com>
---
In terms of general approach, migrating the dirty bitmap alongside the
blocks it describes makes sense when migrating both.
Is this a lot of overhead when that's not the case, though? If we
utilize the "bitmap only" pathways added here, don't we iterate through
the savevm handlers a lot to only transfer very little data per section?
If we really need migration of bitmaps apart from the data they
describe, is it not worth developing a more condensed transfer mechanism
to get more bitmap data per iteration, instead of just one block's worth?
block-migration.c | 173 +++++++++++++++++++++++++++++++++++++++++++++++-------
savevm.c | 1 +
2 files changed, 154 insertions(+), 20 deletions(-)
diff --git a/block-migration.c b/block-migration.c
index 908a66d..95d54a1 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -32,6 +32,8 @@
#define BLK_MIG_FLAG_EOS 0x02
#define BLK_MIG_FLAG_PROGRESS 0x04
#define BLK_MIG_FLAG_ZERO_BLOCK 0x08
+#define BLK_MIG_FLAG_HAS_BLOCK 0x10
+#define BLK_MIG_FLAG_HAS_BITMAPS 0x20
OK: As a result of allowing bitmaps to be migrated without the block
data itself, we now must acknowledge the concept that we can send either
block data, bitmaps, both, or neither -- so new defines are added here
to indicate what data can be found in the section following.
#define MAX_IS_ALLOCATED_SEARCH 65536
@@ -51,6 +53,8 @@ typedef struct BlkMigDevState {
int shared_base;
int64_t total_sectors;
QSIMPLEQ_ENTRY(BlkMigDevState) entry;
+ int nr_bitmaps;
+ BdrvDirtyBitmap **dirty_bitmaps;
/* Only used by migration thread. Does not need a lock. */
int bulk_completed;
@@ -64,6 +68,11 @@ typedef struct BlkMigDevState {
Error *blocker;
} BlkMigDevState;
+typedef struct BlkMigDirtyBitmap {
+ BdrvDirtyBitmap *bitmap;
+ uint8_t *buf;
+} BlkMigDirtyBitmap;
+
typedef struct BlkMigBlock {
/* Only used by migration thread. */
uint8_t *buf;
@@ -74,6 +83,9 @@ typedef struct BlkMigBlock {
QEMUIOVector qiov;
BlockAIOCB *aiocb;
+ int nr_bitmaps;
+ BlkMigDirtyBitmap *dirty_bitmaps;
+
/* Protected by block migration lock. */
int ret;
QSIMPLEQ_ENTRY(BlkMigBlock) entry;
@@ -83,6 +95,7 @@ typedef struct BlkMigState {
/* Written during setup phase. Can be read without a lock. */
int blk_enable;
int shared_base;
+ int dbm_enable;
Similar to the feedback in a previous patch, we may not want to use
'dbm' to mean "Dirty Bitmap" because we have not been applying the
abbreviation consistently.
For now, the recommendation from Stefan is to use the full
"bdrv_dirty_bitmap" or "dirty_bitmap" in function names.
If we do want an acronym to refer to this particular type of dirty
bitmap, we should apply it consistently to all functions that work with
the BdrvDirtyBitmap type.
For now, "bdrv_dirty_bitmap_enable" should suffice, even though it's a
bit wordy.
QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
int64_t total_sector_sum;
bool zero_blocks;
@@ -116,27 +129,63 @@ static void blk_mig_unlock(void)
/* Only allocating and initializing structure fields, not copying any data. */
static BlkMigBlock *blk_create(BlkMigDevState *bmds, int64_t sector,
- int nr_sectors)
+ int nr_sectors, bool only_bitmaps)
{
+ int i;
BlkMigBlock *blk = g_new(BlkMigBlock, 1);
- blk->buf = g_malloc(BLOCK_SIZE);
+ blk->buf = only_bitmaps ? NULL : g_malloc(BLOCK_SIZE);
blk->bmds = bmds;
blk->sector = sector;
blk->nr_sectors = nr_sectors;
+ blk->dirty_bitmaps = NULL;
+ blk->nr_bitmaps = 0;
blk->iov.iov_base = blk->buf;
blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
We can probably skip this block if only_bitmaps is true.
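Something like this (untested sketch):

    if (!only_bitmaps) {
        /* qiov is only consumed by bdrv_aio_readv(), which we never
         * issue for bitmap-only blocks, so there is nothing to set up. */
        blk->iov.iov_base = blk->buf;
        blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
    }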
+ if (block_mig_state.dbm_enable) {
+ BlkMigDirtyBitmap *mig_bm;
+
+ blk->nr_bitmaps = bmds->nr_bitmaps;
+ mig_bm = blk->dirty_bitmaps = g_new(BlkMigDirtyBitmap,
+ bmds->nr_bitmaps);
+ for (i = 0; i < bmds->nr_bitmaps; ++i) {
+ BdrvDirtyBitmap *bm = bmds->dirty_bitmaps[i];
+ mig_bm->buf = g_malloc(bdrv_dbm_data_size(bm, nr_sectors));
+ mig_bm->bitmap = bm;
+ mig_bm++;
+ }
+ }
+
It's strange to me that we'd give it an "only bitmaps" boolean and then
rely on a different condition to populate them. Is it not worthy of an
error if we specify only_bitmaps when block_mig_state.dbm_enable is false?
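If nothing else, an assert at the top of blk_create() would make the
contract explicit (sketch):

    /* bitmap-only blocks are only meaningful when bitmap migration
     * is enabled */
    assert(!only_bitmaps || block_mig_state.dbm_enable);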
return blk;
}
static void blk_free(BlkMigBlock *blk)
{
+ int i;
g_free(blk->buf);
+
+ if (blk->dirty_bitmaps) {
+ for (i = 0; i < blk->nr_bitmaps; ++i) {
+ g_free(blk->dirty_bitmaps[i].buf);
+ }
+ g_free(blk->dirty_bitmaps);
+ }
+
g_free(blk);
}
+static void blk_store_bitmaps(BlkMigBlock *blk)
+{
+ int i;
+ for (i = 0; i < blk->nr_bitmaps; ++i) {
+ bdrv_dbm_store_data(blk->dirty_bitmaps[i].bitmap,
+ blk->dirty_bitmaps[i].buf,
+ blk->sector, blk->nr_sectors);
+ }
+}
+
I am worried about the case in which blk->nr_sectors is not an integral
multiple of the bitmap sector granularity, for reasons discussed in the
previous patches.
I'm not positive it IS a problem either, but maybe you have already
thought about this.
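If it does turn out to matter, even a defensive check in
blk_store_bitmaps() would make the assumption explicit (untested
sketch, inside the existing loop; assumes a byte-granularity getter
along the lines of bdrv_dirty_bitmap_granularity(), which may not be
spelled exactly that way in this series):

    BdrvDirtyBitmap *bm = blk->dirty_bitmaps[i].bitmap;
    int64_t gran = bdrv_dirty_bitmap_granularity(bm) >> BDRV_SECTOR_BITS;
    /* A partial chunk only round-trips safely at the very end of the
     * device; anywhere else it would clip bits of the stored bitmap. */
    assert(blk->nr_sectors % gran == 0 ||
           blk->sector + blk->nr_sectors == blk->bmds->total_sectors);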
/* Must run outside of the iothread lock during the bulk phase,
* or the VM will stall.
*/
@@ -146,9 +195,15 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk)
int len;
uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;
- if (block_mig_state.zero_blocks &&
+ if (block_mig_state.zero_blocks && blk->buf &&
buffer_is_zero(blk->buf, BLOCK_SIZE)) {
flags |= BLK_MIG_FLAG_ZERO_BLOCK;
+ } else if (blk->buf) {
+ flags |= BLK_MIG_FLAG_HAS_BLOCK;
+ }
+
+ if (block_mig_state.dbm_enable) {
+ flags |= BLK_MIG_FLAG_HAS_BITMAPS;
}
/* sector number and flags */
@@ -160,10 +215,27 @@ static void blk_send(QEMUFile *f, BlkMigBlock * blk)
qemu_put_byte(f, len);
qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);
+ /* dirty bitmaps */
+ if (flags & BLK_MIG_FLAG_HAS_BITMAPS) {
+ int i;
+ qemu_put_be32(f, blk->nr_bitmaps);
+ for (i = 0; i < blk->nr_bitmaps; ++i) {
+ BdrvDirtyBitmap *bm = blk->dirty_bitmaps[i].bitmap;
+ int buf_size = bdrv_dbm_data_size(bm, blk->nr_sectors);
+ const char *name = bdrv_dirty_bitmap_name(bm);
+ int len = strlen(name);
+
+ qemu_put_byte(f, len);
+ qemu_put_buffer(f, (const uint8_t *)name, len);
No length in the stream for the size of the dirty bitmap buffer?
+ qemu_put_buffer(f, blk->dirty_bitmaps[i].buf, buf_size);
+ }
+ }
+
Future patch idea: We can probably optimize this block by testing a
range of sectors on the bitmap and skipping bitmaps that don't have data
for this block.
Since we populate the bitmaps beforehand, we already have the chance to
count how many of them are nonempty, so we could send that count
instead of blk->nr_bitmaps.
Not important for the series right now.
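For illustration, even something naive like this would let blk_send()
skip clean bitmaps (sketch, using the existing per-sector query; a
proper range query would obviously be better):

    static bool bitmap_range_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bm,
                                   int64_t sector, int nr_sectors)
    {
        int64_t i;
        for (i = sector; i < sector + nr_sectors; ++i) {
            if (bdrv_get_dirty(bs, bm, i)) {
                return true;
            }
        }
        return false;
    }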
/* if a block is zero we need to flush here since the network
* bandwidth is now a lot higher than the storage device bandwidth.
- * thus if we queue zero blocks we slow down the migration */
- if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
+ * thus if we queue zero blocks we slow down the migration.
+ * also, skip writing block when migrate only dirty bitmaps. */
+ if (!(flags & BLK_MIG_FLAG_HAS_BLOCK)) {
I'm not completely clear on the reasoning behind the original codeblock
here, but changing it from "Only when zeroes" to "Not when we have data:
Zeroes and Bitmaps" makes sense, since bitmap-only sections are going to
be very sparse too.
qemu_fflush(f);
return;
}
A little after this, there's a call to qemu_put_buffer(f, blk->buf,
BLOCK_SIZE), but we have the case where blk->buf may be NULL in
bitmap-only cases. I think we need to guard against that.
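E.g. (untested sketch):

    if (blk->buf) {
        qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
    }

or an assert there, if the HAS_BLOCK early return above is considered
guard enough to document the invariant.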
@@ -282,13 +354,20 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
BlockDriverState *bs = bmds->bs;
BlkMigBlock *blk;
int nr_sectors;
+ bool skip_chunk = false;
if (bmds->shared_base) {
qemu_mutex_lock_iothread();
- while (cur_sector < total_sectors &&
- !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
- &nr_sectors)) {
- cur_sector += nr_sectors;
+ if (block_mig_state.dbm_enable) {
+ bdrv_is_allocated(bs, cur_sector, BDRV_SECTORS_PER_DIRTY_CHUNK,
+ &nr_sectors);
+ skip_chunk = nr_sectors >= BDRV_SECTORS_PER_DIRTY_CHUNK;
+ } else {
+ while (cur_sector < total_sectors &&
+ !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
+ &nr_sectors)) {
+ cur_sector += nr_sectors;
+ }
OK, so the approach taken here is that if bitmaps are enabled, we check
only to see if this current chunk is allocated or not. If it isn't, we
declare this a "bitmap only" chunk and set skip_chunk to true.
The else clause taken implies no bitmap data, because either dbm_enable
or blk_enable (or both) MUST be set for us to be here.
(1) Why are you not checking the return value of bdrv_is_allocated? It
could return either true or false AND nr_sectors ==
BDRV_SECTORS_PER_DIRTY_CHUNK, provided it was entirely allocated or
entirely unallocated, so I think this condition is not working as you
intend it to.
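I would have expected something more like (untested sketch, error
handling omitted):

    int ret = bdrv_is_allocated(bs, cur_sector, BDRV_SECTORS_PER_DIRTY_CHUNK,
                                &nr_sectors);
    /* Skip the data only when the whole chunk is unallocated here,
     * i.e. it lives entirely in the shared base. */
    skip_chunk = ret == 0 && nr_sectors >= BDRV_SECTORS_PER_DIRTY_CHUNK;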
(2) This seems slightly hacky in a sparse or no-data situation. We will
transmit many small data sections, each containing a chunk of bitmap
data, one at a time, with no data interspersed -- which leaves a high
metadata-to-data ratio in these cases.
Would it be an involved process to alter the nr_sectors count to be
something that isn't a single sector in the (dbm_enable && !blk_enable)
case? That way we can spin until we find data worth transferring and
transfer all of the bitmap metadata in a larger chunk all at once.
Whatever approach you choose, it might be nice to have a comment in-line
explaining the general approach.
}
qemu_mutex_unlock_iothread();
}
@@ -309,7 +388,8 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
nr_sectors = total_sectors - cur_sector;
}
- blk = blk_create(bmds, cur_sector, nr_sectors);
+ blk = blk_create(bmds, cur_sector, nr_sectors,
+ skip_chunk || !block_mig_state.blk_enable);
blk_mig_lock();
block_mig_state.submitted++;
@@ -317,8 +397,16 @@ static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
bdrv_reset_dirty_bitmap(bs, bmds->dirty_bitmap, cur_sector, nr_sectors);
- blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
- nr_sectors, blk_mig_read_cb, blk);
+ if (block_mig_state.dbm_enable) {
+ blk_store_bitmaps(blk);
+ }
+
+ if (block_mig_state.blk_enable && !skip_chunk) {
+ blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
+ nr_sectors, blk_mig_read_cb, blk);
+ } else if (block_mig_state.dbm_enable) {
+ blk_mig_read_cb(blk, 0);
+ }
bmds->cur_sector = cur_sector + nr_sectors;
return (bmds->cur_sector >= total_sectors);
@@ -403,6 +491,8 @@ static void init_blk_migration(QEMUFile *f)
DPRINTF("Start full migration for %s\n",
bdrv_get_device_name(bs));
}
+ bmds->dirty_bitmaps = bdrv_dbm_find_all_named(bs, &bmds->nr_bitmaps);
+
At this point, we should have block_mig_state.dbm_enable set (or
disabled), so we can skip this initialization if it is not set.
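i.e. (sketch):

    if (block_mig_state.dbm_enable) {
        bmds->dirty_bitmaps = bdrv_dbm_find_all_named(bs, &bmds->nr_bitmaps);
    }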
QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
}
}
@@ -481,20 +571,32 @@ static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
} else {
nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
}
- blk = blk_create(bmds, sector, nr_sectors);
+ blk = blk_create(bmds, sector, nr_sectors,
+ !block_mig_state.blk_enable);
+
+ if (block_mig_state.dbm_enable) {
+ blk_store_bitmaps(blk);
+ }
if (is_async) {
- blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
- nr_sectors, blk_mig_read_cb, blk);
+ if (block_mig_state.blk_enable) {
+ blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
+ nr_sectors, blk_mig_read_cb,
+ blk);
+ } else {
+ blk_mig_read_cb(blk, 0);
+ }
blk_mig_lock();
block_mig_state.submitted++;
bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
blk_mig_unlock();
} else {
- ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
- if (ret < 0) {
- goto error;
+ if (block_mig_state.blk_enable) {
+ ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
+ if (ret < 0) {
+ goto error;
+ }
}
blk_send(f, blk);
@@ -817,10 +919,39 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
}
+ /* load dirty bitmaps */
+ if (flags & BLK_MIG_FLAG_HAS_BITMAPS) {
+ int i, nr_bitmaps = qemu_get_be32(f);
+
+ for (i = 0; i < nr_bitmaps; ++i) {
+ char bitmap_name[256];
+ BdrvDirtyBitmap *bitmap;
+ int buf_size, len;
+
+ len = qemu_get_byte(f);
+ qemu_get_buffer(f, (uint8_t *)bitmap_name, len);
+ bitmap_name[len] = '\0';
+
+ bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
+ if (!bitmap) {
+ fprintf(stderr, "Error unknown dirty bitmap\
+ %s for block device %s\n",
+ bitmap_name, device_name);
+ return -EINVAL;
+ }
+
+ buf_size = bdrv_dbm_data_size(bitmap, nr_sectors);
Oh, this is why you didn't store the length... Still, since it seems as
if you expect the bitmap to already exist on the destination, what if
the granularity or size is different? It might be nice to explicitly
state the size of the buffer -- one user misconfiguration and you might
blow the rest of the migration.
This way, if the calculated size doesn't match the received size, you
have the chance to throw a meaningful error.
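Untested sketch of what I mean -- on the send side in blk_send():

    qemu_put_be32(f, buf_size);
    qemu_put_buffer(f, blk->dirty_bitmaps[i].buf, buf_size);

and here on the load side:

    int recv_size = qemu_get_be32(f);
    buf_size = bdrv_dbm_data_size(bitmap, nr_sectors);
    if (recv_size != buf_size) {
        fprintf(stderr, "Dirty bitmap %s: expected %d bytes, got %d\n",
                bitmap_name, buf_size, recv_size);
        return -EINVAL;
    }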
+ buf = g_malloc(buf_size);
+ qemu_get_buffer(f, buf, buf_size);
+ bdrv_dbm_restore_data(bitmap, buf, addr, nr_sectors);
+ g_free(buf);
+ }
+ }
+
So with this approach, the user needs to have created the bitmaps
already? Would it be possible to create a mode where they don't, or am I
misreading something?
We could re-create them on the fly whenever the offset is 0 on the
receiving side; the only other piece of information we'd really need at
that point is the granularity.
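Roughly (sketch; assumes the granularity is also put on the wire next
to the name, which this patch doesn't do yet, and that the named-bitmap
creation helper from earlier in the series takes a name argument):

    bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
    if (!bitmap) {
        Error *local_err = NULL;
        bitmap = bdrv_create_dirty_bitmap(bs, granularity, bitmap_name,
                                          &local_err);
        if (!bitmap) {
            fprintf(stderr, "Error creating dirty bitmap %s: %s\n",
                    bitmap_name, error_get_pretty(local_err));
            error_free(local_err);
            return -EINVAL;
        }
    }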
if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
ret = bdrv_write_zeroes(bs, addr, nr_sectors,
BDRV_REQ_MAY_UNMAP);
- } else {
+ } else if (flags & BLK_MIG_FLAG_HAS_BLOCK) {
buf = g_malloc(BLOCK_SIZE);
qemu_get_buffer(f, buf, BLOCK_SIZE);
ret = bdrv_write(bs, addr, buf, nr_sectors);
@@ -855,6 +986,7 @@ static void block_set_params(const MigrationParams *params, void *opaque)
{
block_mig_state.blk_enable = params->blk;
block_mig_state.shared_base = params->shared;
+ block_mig_state.dbm_enable = params->dirty;
/* shared base means that blk_enable = 1 */
block_mig_state.blk_enable |= params->shared;
@@ -862,7 +994,8 @@ static void block_set_params(const MigrationParams *params, void *opaque)
static bool block_is_active(void *opaque)
{
- return block_mig_state.blk_enable == 1;
+ return block_mig_state.blk_enable == 1 ||
+ block_mig_state.dbm_enable == 1;
}
static SaveVMHandlers savevm_block_handlers = {
diff --git a/savevm.c b/savevm.c
index a598d1d..1299faa 100644
--- a/savevm.c
+++ b/savevm.c
@@ -983,6 +983,7 @@ int qemu_loadvm_state(QEMUFile *f)
}
}
+ bdrv_dbm_restore_finish();
Do we not have a cleaner way to call this? I guess we have no explicit
callback mechanisms for when a particular BDS is completely done,
because of the multiple passes over the data that we perform --
Still, it might be nicer to have a callback for the whole of block
migration, and call bdrv_dbm_restore_finish() from there instead of
putting this implementation detail all the way up inside of
qemu_loadvm_state.
Can we migrate an empty-data sentinel section from block migration upon
completion that qemu_loadvm_state launches a post_load method for that
can invoke bdrv_dbm_restore_finish for us?
Alternatively, can this not be safely done inside of block_load,
somewhere around here:
printf("Completed %d %%%c", (int)addr,
(addr == 100) ? '\n' : '\r');
fflush(stdout);
The "100%" completion flag is only written in block_save_complete(),
so once we read this flag we should have necessary and sufficient
information to conclude that it's safe to do the BdrvDirtyBitmap
migration post-load stuff.
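i.e. (sketch):

    printf("Completed %d %%%c", (int)addr,
           (addr == 100) ? '\n' : '\r');
    fflush(stdout);

    if (addr == 100) {
        /* Final pass complete: every bitmap chunk has been received,
         * so we can finalize here rather than in qemu_loadvm_state(). */
        bdrv_dbm_restore_finish();
    }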
Of course, maybe generic section post-load callbacks are useful to other
devices, and I'd be fine with either approach.
CCing migration people for opinions.
cpu_synchronize_all_post_init();
ret = 0;
Thank you, and sorry it took me so long to digest the series!
--John Snow
P.S.: Happy New Year! I hope 2015 treats you well :~)