Support live image copy + switch: copy the image backing a guest hard
disk to a destination image (which must be created separately), then
switch the guest to the copy.
Command syntax:

block_copy device filename [commit_filename] [-i] -- live block copy of a
device to an image
    commit_filename: optional file in which the completed copy is recorded
    -i: incremental copy (base image shared between source and destination)
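
For example, from the monitor (a sketch; the device name and paths are
illustrative, and the destination images must be created beforehand,
e.g. with qemu-img):

    # full copy, recording completion in a commit file
    (qemu) block_copy virtio0 /new-storage/disk.img /new-storage/disk.commit

    # incremental copy to an image sharing the source's backing file
    (qemu) block_copy virtio0 /new-storage/disk.qcow2 -i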
Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
Index: qemu/block-copy.c
===================================================================
--- /dev/null
+++ qemu/block-copy.c
@@ -0,0 +1,741 @@
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosa...@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "blockdev.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+
+#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ */
+
+enum BdrvCopyStage {
+ STAGE_BULK,
+ STAGE_BULK_FINISHED,
+ STAGE_DIRTY,
+ STAGE_SWITCH_FINISHED,
+};
+
+typedef struct BdrvCopyState {
+ BlockDriverState *src;
+ BlockDriverState *dst;
+ bool shared_base;
+
+ int64_t curr_sector;
+ int64_t completed_sectors;
+ int64_t nr_sectors;
+
+ enum BdrvCopyStage stage;
+ int inflight_reads;
+ int error;
+ int failed;
+ int cancelled;
+ QLIST_HEAD(, BdrvCopyBlock) io_list;
+ unsigned long *aio_bitmap;
+ QEMUTimer *aio_timer;
+ QLIST_ENTRY(BdrvCopyState) list;
+
+ int64_t blocks;
+ int64_t total_time;
+
+ char src_device_name[32];
+ char dst_filename[1024];
+ int commit_fd;
+} BdrvCopyState;
+
+typedef struct BdrvCopyBlock {
+ BdrvCopyState *state;
+ uint8_t *buf;
+ int64_t sector;
+ int64_t nr_sectors;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ BlockDriverAIOCB *aiocb;
+ int64_t time;
+ QLIST_ENTRY(BdrvCopyBlock) list;
+} BdrvCopyBlock;
+
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+ QLIST_HEAD_INITIALIZER(block_copy_list);
+
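+/* Allocate a bitmap with one bit per dirty-tracking chunk of the source
+ * image, used to mark chunks that have copy I/O in flight. */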
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+ BlockDriverState *bs = s->src;
+ int64_t bitmap_size;
+
+ bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
+ BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
+ bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
+
+ s->aio_bitmap = qemu_mallocz(bitmap_size);
+}
+
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+ int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ if (s->aio_bitmap &&
+ (sector << BDRV_SECTOR_BITS) < bdrv_getlength(s->src)) {
+ return !!(s->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
+ (1UL << (chunk % (sizeof(unsigned long) * 8))));
+ } else {
+ return 0;
+ }
+}
+
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+ int nb_sectors, int set)
+{
+ int64_t start, end;
+ unsigned long val, idx, bit;
+
+ start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+ end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ for (; start <= end; start++) {
+ idx = start / (sizeof(unsigned long) * 8);
+ bit = start % (sizeof(unsigned long) * 8);
+ val = s->aio_bitmap[idx];
+ if (set) {
+ if (!(val & (1UL << bit))) {
+ val |= 1UL << bit;
+ }
+ } else {
+ if (val & (1UL << bit)) {
+ val &= ~(1UL << bit);
+ }
+ }
+ s->aio_bitmap[idx] = val;
+ }
+}
+
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+ s->stage = stage;
+
+ switch (stage) {
+ case STAGE_BULK:
+ BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK);
+ break;
+ case STAGE_BULK_FINISHED:
+ BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_BULK_FINISHED);
+ break;
+ case STAGE_DIRTY:
+ BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_DIRTY);
+ break;
+ case STAGE_SWITCH_FINISHED:
+ BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED);
+ break;
+ default:
+ break;
+ }
+}
+
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+ s->error = ret;
+ qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+}
+
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+ s->blocks++;
+ s->total_time += time;
+}
+
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+ BdrvCopyBlock *blk = opaque;
+ BdrvCopyState *s = blk->state;
+
+ if (ret < 0) {
+ QLIST_REMOVE(blk, list);
+ qemu_free(blk->buf);
+ qemu_free(blk);
+ blk_copy_handle_cb_error(s, ret);
+ return;
+ }
+
+ QLIST_REMOVE(blk, list);
+ add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - blk->time);
+
+ /* schedule switch to STAGE_DIRTY on last bulk write completion */
+ if (blk->state->stage == STAGE_BULK_FINISHED) {
+ qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+ }
+
+ if (blk->state->stage > STAGE_BULK_FINISHED) {
+ set_aio_inflight(blk->state, blk->sector, blk->nr_sectors, 0);
+ }
+
+ qemu_free(blk->buf);
+ qemu_free(blk);
+}
+
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+ BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
+ blk->state = s;
+ blk->sector = read_blk->sector;
+ blk->nr_sectors = read_blk->nr_sectors;
+ blk->time = read_blk->time;
+ blk->buf = read_blk->buf;
+ QLIST_INSERT_HEAD(&s->io_list, blk, list);
+
+ blk->iov.iov_base = read_blk->buf;
+ blk->iov.iov_len = read_blk->iov.iov_len;
+ qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+ BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+ blk->aiocb = bdrv_aio_writev(s->dst, blk->sector, &blk->qiov,
+ blk->iov.iov_len / BDRV_SECTOR_SIZE,
+ blk_copy_write_cb, blk);
+ if (!blk->aiocb) {
+ s->error = 1;
+ goto error;
+ }
+
+ return;
+
+error:
+ QLIST_REMOVE(blk, list);
+ qemu_free(read_blk->buf);
+ qemu_free(blk);
+}
+
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+ BdrvCopyBlock *blk = opaque;
+ BdrvCopyState *s = blk->state;
+
+ s->inflight_reads--;
+ if (ret < 0) {
+ QLIST_REMOVE(blk, list);
+ qemu_free(blk->buf);
+ qemu_free(blk);
+ blk_copy_handle_cb_error(s, ret);
+ return;
+ }
+ blk_copy_issue_write(s, blk);
+ QLIST_REMOVE(blk, list);
+ qemu_free(blk);
+ qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+}
+
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+ int nr_sectors)
+{
+ BdrvCopyBlock *blk = qemu_mallocz(sizeof(BdrvCopyBlock));
+ blk->buf = qemu_mallocz(BLOCK_SIZE);
+ blk->state = s;
+ blk->sector = sector;
+ blk->nr_sectors = nr_sectors;
+ QLIST_INSERT_HEAD(&s->io_list, blk, list);
+
+ blk->iov.iov_base = blk->buf;
+ blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+ qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+ s->inflight_reads++;
+ blk->time = qemu_get_clock_ns(rt_clock);
+ blk->aiocb = bdrv_aio_readv(s->src, sector, &blk->qiov, nr_sectors,
+ blk_copy_read_cb, blk);
+ if (!blk->aiocb) {
+ s->error = 1;
+ goto error;
+ }
+
+ return;
+
+error:
+ s->inflight_reads--;
+ QLIST_REMOVE(blk, list);
+ qemu_free(blk->buf);
+ qemu_free(blk);
+}
+
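+/* Decide whether the remaining dirty data can be flushed within the
+ * migration downtime limit, estimated from the average per-block
+ * transfer time observed so far. */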
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+ int64_t remaining_dirty;
+ int64_t avg_transfer_time;
+
+ remaining_dirty = bdrv_get_dirty_count(s->src);
+ if (remaining_dirty == 0 || s->blocks == 0) {
+ return true;
+ }
+
+ avg_transfer_time = s->total_time / s->blocks;
+ if ((remaining_dirty * avg_transfer_time) <= migrate_max_downtime()) {
+ return true;
+ }
+ return false;
+}
+
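+/* Issue a read for the next dirty chunk (at most one per call), skipping
+ * chunks that already have copy I/O in flight. */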
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+ int64_t sector;
+
+ for (sector = s->curr_sector; sector < s->nr_sectors;) {
+ if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
+ int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+ BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+ blk_copy_issue_read(s, sector, nr_sectors);
+ bdrv_reset_dirty(s->src, sector, nr_sectors);
+ set_aio_inflight(s, sector, nr_sectors, 1);
+ break;
+ }
+
+ sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+ s->curr_sector = sector;
+ }
+
+ if (sector >= s->nr_sectors) {
+ s->curr_sector = 0;
+ }
+ return 0;
+}
+
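+/* Issue a bulk read for the next chunk; with a shared base image,
+ * unallocated sectors are skipped. Returns 1 once the end of the
+ * device has been reached. */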
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+ int nr_sectors;
+ int64_t curr_sector = s->curr_sector;
+
+ if (s->shared_base) {
+ while (curr_sector < s->nr_sectors &&
+ !bdrv_is_allocated(s->src, curr_sector,
+ MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+ curr_sector += nr_sectors;
+ }
+ }
+
+ if (curr_sector >= s->nr_sectors) {
+ s->curr_sector = 0;
+ return 1;
+ }
+
+ curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+ nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ blk_copy_issue_read(s, s->curr_sector, nr_sectors);
+ s->curr_sector += nr_sectors;
+ s->completed_sectors = curr_sector;
+ return 0;
+}
+
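+/* Synchronously copy the remaining dirty chunks to the destination,
+ * the final step before switching to the new image. */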
+static void blkcopy_finish(BdrvCopyState *s)
+{
+ int64_t sector;
+ uint8_t *buf;
+
+ buf = qemu_malloc(BLOCK_SIZE);
+
+ /* FIXME: speed up loop, get_next_dirty_block? */
+ for (sector = 0; sector < s->nr_sectors;
+ sector += BDRV_SECTORS_PER_DIRTY_CHUNK) {
+ if (bdrv_get_dirty(s->src, sector)) {
+ int nr_sectors = MIN(s->nr_sectors - sector,
+ BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+ memset(buf, 0, BLOCK_SIZE);
+ if (bdrv_read(s->src, sector, buf, nr_sectors) < 0) {
+ goto error;
+ }
+ if (bdrv_write(s->dst, sector, buf, nr_sectors) < 0) {
+ goto error;
+ }
+ bdrv_reset_dirty(s->src, sector, nr_sectors);
+ }
+
+ if (bdrv_get_dirty_count(s->src) == 0) {
+ break;
+ }
+ }
+ qemu_free(buf);
+ return;
+
+error:
+ qemu_free(buf);
+ s->error = 1;
+}
+
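+/* Write a one-line record of the completed copy to the user-supplied
+ * commit file and fsync it. */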
+static int write_commit_file(BdrvCopyState *s)
+{
+ char commit_msg[1400];
+ const char *buf = commit_msg;
+ int len, ret;
+
+ sprintf(commit_msg, "commit QEMU block_copy %s -> %s\n",
+ s->src_device_name,
+ s->dst_filename);
+
+ len = strlen(commit_msg);
+ while (len > 0) {
+ ret = write(s->commit_fd, buf, len);
+ if (ret == -1 && errno == EINTR) {
+ continue;
+ }
+ if (ret <= 0) {
+ return -errno;
+ }
+ buf += ret;
+ len -= ret;
+ }
+
+ if (fsync(s->commit_fd) == -1) {
+ return -errno;
+ }