On Tue, 28 Feb 2012 15:54:06 -0500 Jeff Cody <jc...@redhat.com> wrote:
> This is a QAPI/QMP only command to take a snapshot of a group of > devices. This is similar to the blockdev-snapshot-sync command, except > blockdev-group-snapshot-sync accepts a list devices, filenames, and > formats. > > It is attempted to keep the snapshot of the group atomic; if the > creation or open of any of the new snapshots fails, then all of > the new snapshots are abandoned, and the name of the snapshot image > that failed is returned. The failure case should not interrupt > any operations. > > Rather than use bdrv_close() along with a subsequent bdrv_open() to > perform the pivot, the original image is never closed and the new > image is placed 'in front' of the original image via manipulation > of the BlockDriverState fields. Thus, once the new snapshot image > has been successfully created, there are no more failure points > before pivoting to the new snapshot. > > This allows the group of disks to remain consistent with each other, > even across snapshot failures. > > Signed-off-by: Jeff Cody <jc...@redhat.com> This looks fine to me, I have some small remarks that can be addressed by a follow up patch or ignored :) > --- > block.c | 81 +++++++++++++++++++++++++++++++++ > block.h | 1 + > block_int.h | 6 +++ > blockdev.c | 131 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > qapi-schema.json | 38 ++++++++++++++++ > 5 files changed, 257 insertions(+), 0 deletions(-) > > diff --git a/block.c b/block.c > index 3621d11..90232d9 100644 > --- a/block.c > +++ b/block.c > @@ -880,6 +880,87 @@ void bdrv_make_anon(BlockDriverState *bs) > bs->device_name[0] = '\0'; > } > > +/* > + * Add new bs contents at the top of an image chain while the chain is > + * live, while keeping required fields on the top layer. > + * > + * This will modify the BlockDriverState fields, and swap contents > + * between bs_new and bs_top. Both bs_new and bs_top are modified. > + * > + * This function does not create any image files. > + */ > +void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top) > +{ > + BlockDriverState tmp; > + > + /* the new bs must not be in bdrv_states */ > + bdrv_make_anon(bs_new); That should be the case already no? If so, then an assert() is better. > + > + tmp = *bs_new; > + > + /* there are some fields that need to stay on the top layer: */ > + > + /* dev info */ > + tmp.dev_ops = bs_top->dev_ops; > + tmp.dev_opaque = bs_top->dev_opaque; > + tmp.dev = bs_top->dev; > + tmp.buffer_alignment = bs_top->buffer_alignment; > + tmp.copy_on_read = bs_top->copy_on_read; > + > + /* i/o timing parameters */ > + tmp.slice_time = bs_top->slice_time; > + tmp.slice_start = bs_top->slice_start; > + tmp.slice_end = bs_top->slice_end; > + tmp.io_limits = bs_top->io_limits; > + tmp.io_base = bs_top->io_base; > + tmp.throttled_reqs = bs_top->throttled_reqs; > + tmp.block_timer = bs_top->block_timer; > + tmp.io_limits_enabled = bs_top->io_limits_enabled; > + > + /* geometry */ > + tmp.cyls = bs_top->cyls; > + tmp.heads = bs_top->heads; > + tmp.secs = bs_top->secs; > + tmp.translation = bs_top->translation; > + > + /* r/w error */ > + tmp.on_read_error = bs_top->on_read_error; > + tmp.on_write_error = bs_top->on_write_error; > + > + /* i/o status */ > + tmp.iostatus_enabled = bs_top->iostatus_enabled; > + tmp.iostatus = bs_top->iostatus; > + > + /* keep the same entry in bdrv_states */ > + pstrcpy(tmp.device_name, sizeof(tmp.device_name), bs_top->device_name); > + tmp.list = bs_top->list; > + > + /* The contents of 'tmp' will become bs_top, as we are > + * swapping bs_new and bs_top contents. */ > + tmp.backing_hd = bs_new; > + pstrcpy(tmp.backing_file, sizeof(tmp.backing_file), bs_top->filename); > + > + /* swap contents of the fixed new bs and the current top */ > + *bs_new = *bs_top; > + *bs_top = tmp; > + > + /* clear the copied fields in the new backing file */ > + bdrv_detach_dev(bs_new, bs_new->dev); > + > + qemu_co_queue_init(&bs_new->throttled_reqs); > + memset(&bs_new->io_base, 0, sizeof(bs_new->io_base)); > + memset(&bs_new->io_limits, 0, sizeof(bs_new->io_limits)); > + bdrv_iostatus_disable(bs_new); > + > + /* we don't use bdrv_io_limits_disable() for this, because we don't want > + * to affect or delete the block_timer, as it has been moved to bs_top */ > + bs_new->io_limits_enabled = false; > + bs_new->block_timer = NULL; > + bs_new->slice_time = 0; > + bs_new->slice_start = 0; > + bs_new->slice_end = 0; > +} > + > void bdrv_delete(BlockDriverState *bs) > { > assert(!bs->dev); > diff --git a/block.h b/block.h > index cae289b..190a780 100644 > --- a/block.h > +++ b/block.h > @@ -114,6 +114,7 @@ int bdrv_create(BlockDriver *drv, const char* filename, > int bdrv_create_file(const char* filename, QEMUOptionParameter *options); > BlockDriverState *bdrv_new(const char *device_name); > void bdrv_make_anon(BlockDriverState *bs); > +void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top); > void bdrv_delete(BlockDriverState *bs); > int bdrv_parse_cache_flags(const char *mode, int *flags); > int bdrv_file_open(BlockDriverState **pbs, const char *filename, int flags); > diff --git a/block_int.h b/block_int.h > index 7be2988..5edc8c1 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -219,6 +219,12 @@ struct BlockDriver { > QLIST_ENTRY(BlockDriver) list; > }; > > +/* > + * Note: the function bdrv_append() copies and swaps contents of > + * BlockDriverStates, so if you add new fields to this struct, please > + * inspect bdrv_append() to determine if the new fields need to be > + * copied as well. > + */ > struct BlockDriverState { > int64_t total_sectors; /* if we are reading a disk image, give its > size in sectors */ > diff --git a/blockdev.c b/blockdev.c > index ab9fd21..4be9947 100644 > --- a/blockdev.c > +++ b/blockdev.c > @@ -719,6 +719,137 @@ void qmp_blockdev_snapshot_sync(const char *device, > const char *snapshot_file, > } > } > > + > +/* New and old BlockDriverState structs for group snapshots */ > +typedef struct BlkGroupSnapshotStates { > + BlockDriverState *old_bs; > + BlockDriverState *new_bs; > + QSIMPLEQ_ENTRY(BlkGroupSnapshotStates) entry; > +} BlkGroupSnapshotStates; > + > +/* > + * 'Atomic' group snapshots. The snapshots are taken as a set, and if any > fail > + * then we do not pivot any of the devices in the group, and abandon the > + * snapshots > + */ > +void qmp_blockdev_group_snapshot_sync(SnapshotDevList *dev_list, > + Error **errp) > +{ > + int ret = 0; > + SnapshotDevList *dev_entry = dev_list; > + SnapshotDev *dev_info = NULL; > + BlkGroupSnapshotStates *states; > + BlockDriver *proto_drv; > + BlockDriver *drv; > + int flags; > + const char *format; > + const char *snapshot_file; > + > + QSIMPLEQ_HEAD(snap_bdrv_states, BlkGroupSnapshotStates) snap_bdrv_states; > + QSIMPLEQ_INIT(&snap_bdrv_states); > + > + /* drain all i/o before any snapshots */ > + bdrv_drain_all(); > + > + /* We don't do anything in this loop that commits us to the snapshot */ > + while (NULL != dev_entry) { > + dev_info = dev_entry->value; > + dev_entry = dev_entry->next; > + > + states = g_malloc0(sizeof(BlkGroupSnapshotStates)); > + QSIMPLEQ_INSERT_TAIL(&snap_bdrv_states, states, entry); > + > + states->old_bs = bdrv_find(dev_info->device); > + > + if (!states->old_bs) { > + error_set(errp, QERR_DEVICE_NOT_FOUND, dev_info->device); > + goto delete_and_fail; > + } > + > + if (bdrv_in_use(states->old_bs)) { > + error_set(errp, QERR_DEVICE_IN_USE, dev_info->device); > + goto delete_and_fail; > + } > + > + if (!bdrv_is_read_only(states->old_bs) && > + bdrv_is_inserted(states->old_bs)) { > + > + if (bdrv_flush(states->old_bs)) { > + error_set(errp, QERR_IO_ERROR); > + goto delete_and_fail; > + } > + } > + > + snapshot_file = dev_info->snapshot_file; > + > + flags = states->old_bs->open_flags; > + > + if (!dev_info->has_format) { > + format = "qcow2"; > + } else { > + format = dev_info->format; > + } I think it would have been better to default to the format of the imagine being "snapshotted", but maybe it's even better to be compatible with the original snapshot-sync command... > + > + drv = bdrv_find_format(format); > + if (!drv) { > + error_set(errp, QERR_INVALID_BLOCK_FORMAT, format); > + goto delete_and_fail; > + } > + > + proto_drv = bdrv_find_protocol(snapshot_file); > + if (!proto_drv) { > + error_set(errp, QERR_INVALID_BLOCK_FORMAT, format); > + goto delete_and_fail; > + } > + > + /* create new image w/backing file */ > + ret = bdrv_img_create(snapshot_file, format, > + states->old_bs->filename, > + drv->format_name, NULL, -1, flags); > + if (ret) { > + error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file); > + goto delete_and_fail; > + } > + > + /* We will manually add the backing_hd field to the bs later */ > + states->new_bs = bdrv_new(""); > + ret = bdrv_open(states->new_bs, snapshot_file, > + flags | BDRV_O_NO_BACKING, drv); > + if (ret != 0) { > + error_set(errp, QERR_OPEN_FILE_FAILED, snapshot_file); > + goto delete_and_fail; > + } > + } > + > + > + /* Now we are going to do the actual pivot. Everything up to this point > + * is reversible, but we are committed at this point */ > + QSIMPLEQ_FOREACH(states, &snap_bdrv_states, entry) { > + /* This removes our old bs from the bdrv_states, and adds the new bs > */ > + bdrv_append(states->new_bs, states->old_bs); > + } > + > + /* success */ > + goto exit; > + > +delete_and_fail: > + /* > + * failure, and it is all-or-none; abandon each new bs, and keep using > + * the original bs for all images > + */ > + QSIMPLEQ_FOREACH(states, &snap_bdrv_states, entry) { > + if (states->new_bs) { > + bdrv_delete(states->new_bs); Isn't it a good idea to remove any created image files? > + } > + } > +exit: > + QSIMPLEQ_FOREACH(states, &snap_bdrv_states, entry) { > + g_free(states); > + } > + return; > +} > + > + > static void eject_device(BlockDriverState *bs, int force, Error **errp) > { > if (bdrv_in_use(bs)) { > diff --git a/qapi-schema.json b/qapi-schema.json > index d02ee86..165e22e 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -1107,6 +1107,44 @@ > { 'command': 'block_resize', 'data': { 'device': 'str', 'size': 'int' }} > > ## > +# @SnapshotDev > +# > +# @device: the name of the device to generate the snapshot from. > +# > +# @snapshot-file: the target of the new image. A new file will be created. > +# > +# @format: #optional the format of the snapshot image, default is 'qcow2'. > +## > +{ 'type': 'SnapshotDev', > + 'data': {'device': 'str', 'snapshot-file': 'str', '*format': 'str' } } > + > +## > +# @blockdev-group-snapshot-sync > +# > +# Generates a synchronous snapshot of a group of one or more block devices, > +# as atomically as possible. If the snapshot of any device in the group > +# fails, then the entire group snapshot will be abandoned and the > +# appropriate error returned. > +# > +# List of: > +# @SnapshotDev: information needed for the device snapshot > +# > +# Returns: nothing on success > +# If @device is not a valid block device, DeviceNotFound > +# If @device is busy, DeviceInUse will be returned > +# If @snapshot-file can't be created, OpenFileFailed > +# If @snapshot-file can't be opened, OpenFileFailed > +# If @format is invalid, InvalidBlockFormat > +# > +# Note: The group snapshot attempt returns failure on the first snapshot > +# device failure. Therefore, there will be only one device or snapshot file > +# returned in an error condition, and subsequent devices will not have been > +# attempted. > +## > +{ 'command': 'blockdev-group-snapshot-sync', > + 'data': { 'devlist': [ 'SnapshotDev' ] } } > + > +## > # @blockdev-snapshot-sync > # > # Generates a synchronous snapshot of a block device.