On Mon, 07/22 08:34, Paolo Bonzini wrote: > Il 22/07/2013 05:46, Fam Zheng ha scritto: > > This patch eliminates limitation of committing the active device. > > > > bdrv_drop_intermediate is reimplemented to take pointers to > > (BlockDriverState *), so it can modify the caller's local pointers to > > preserve their semantics, while updating active BDS in-place by > > bdrv_swap active and base: we need data in 'base' as it's the only > > remaining after commit, but we can't delete 'active' as it's referenced > > everywhere in the program. > > > > Guest writes to active device during the commit are tracked by dirty map > > and committed like block-mirror. > > I have only skimmed the patch, but I think this is incomplete. > Management needs to know the moment when 'active' is not valid anymore, > thus this job needs to be completed manually with "block-job-complete".
Does management need access to 'active' image outside of QEMU process? Although original 'active' it is "dropped" by bdrv_drop_intermediate, the pointers to original 'active' is still valid because 'base' is moved to this address (with bdrv_swap). I don't know, what is the problem here for management? > > In fact, I wonder if block/commit.c could reuse most of the code from > block/mirror.c (basically everything except that bdrv_swap should be > replaced by bdrv_drop_intermediate). > Hmm, yes, in this case, it is quite similar to mirroring 'active' to 'base' with sync mode top. > > Signed-off-by: Fam Zheng <f...@redhat.com> > > --- > > block.c | 102 ++++++++++---------------------- > > block/commit.c | 160 > > ++++++++++++++++++++++++++------------------------ > > include/block/block.h | 5 +- > > 3 files changed, 115 insertions(+), 152 deletions(-) > > > > diff --git a/block.c b/block.c > > index b560241..367e064 100644 > > --- a/block.c > > +++ b/block.c > > @@ -2018,18 +2018,11 @@ BlockDriverState > > *bdrv_find_overlay(BlockDriverState *active, > > return overlay; > > } > > > > -typedef struct BlkIntermediateStates { > > - BlockDriverState *bs; > > - QSIMPLEQ_ENTRY(BlkIntermediateStates) entry; > > -} BlkIntermediateStates; > > - > > - > > /* > > - * Drops images above 'base' up to and including 'top', and sets the image > > - * above 'top' to have base as its backing file. > > - * > > - * Requires that the overlay to 'top' is opened r/w, so that the backing > > file > > - * information in 'bs' can be properly updated. > > + * Drops images above '*base' up to and including '*top', and sets new > > '*base' > > + * as backing_hd of top_overlay (the image orignally has 'top' as backing > > + * file). top_overlay may be NULL if '*top' is active, no such update > > needed. > > + * Requires that the top_overlay to 'top' is opened r/w. > > * > > * E.g., this will convert the following chain: > > * bottom <- base <- intermediate <- top <- active > > @@ -2046,82 +2039,47 @@ typedef struct BlkIntermediateStates { > > * > > * base <- active > > * > > - * Error conditions: > > - * if active == top, that is considered an error > > + * It also allows active==top, in which case it converts: > > + * > > + * base <- intermediate <- active (also top) > > + * > > + * to > > + * > > + * base == active == top, i.e. only base remains: *top == *base when > > return. > > * > > */ > > -int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, > > - BlockDriverState *base) > > +int bdrv_drop_intermediate(BlockDriverState *top_overlay, > > + BlockDriverState **top, > > + BlockDriverState **base) > > { > > - BlockDriverState *intermediate; > > + BlockDriverState *pbs; > > BlockDriverState *base_bs = NULL; > > - BlockDriverState *new_top_bs = NULL; > > - BlkIntermediateStates *intermediate_state, *next; > > int ret = -EIO; > > > > - QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) > > states_to_delete; > > - QSIMPLEQ_INIT(&states_to_delete); > > - > > - if (!top->drv || !base->drv) { > > + if (!(*top)->drv || !(*base)->drv) { > > goto exit; > > } > > > > - new_top_bs = bdrv_find_overlay(active, top); > > - > > - if (new_top_bs == NULL) { > > - /* we could not find the image above 'top', this is an error */ > > - goto exit; > > + for (pbs = (*top)->backing_hd; pbs != *base; pbs = base_bs) { > > + assert(pbs); > > + base_bs = pbs->backing_hd; > > + pbs->backing_hd = NULL; > > + bdrv_delete(pbs); > > } > > > > - /* special case of new_top_bs->backing_hd already pointing to base - > > nothing > > - * to do, no intermediate images */ > > - if (new_top_bs->backing_hd == base) { > > - ret = 0; > > - goto exit; > > - } > > + bdrv_swap(*base, *top); > > > > - intermediate = top; > > + (*base)->backing_hd = NULL; > > + bdrv_delete(*base); > > + *base = *top; > > > > - /* now we will go down through the list, and add each BDS we find > > - * into our deletion queue, until we hit the 'base' > > - */ > > - while (intermediate) { > > - intermediate_state = g_malloc0(sizeof(BlkIntermediateStates)); > > - intermediate_state->bs = intermediate; > > - QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry); > > - > > - if (intermediate->backing_hd == base) { > > - base_bs = intermediate->backing_hd; > > - break; > > - } > > - intermediate = intermediate->backing_hd; > > - } > > - if (base_bs == NULL) { > > - /* something went wrong, we did not end at the base. safely > > - * unravel everything, and exit with error */ > > - goto exit; > > - } > > - > > - /* success - we can delete the intermediate states, and link top->base > > */ > > - ret = bdrv_change_backing_file(new_top_bs, base_bs->filename, > > - base_bs->drv ? > > base_bs->drv->format_name : ""); > > - if (ret) { > > - goto exit; > > - } > > - new_top_bs->backing_hd = base_bs; > > - > > - > > - QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, > > next) { > > - /* so that bdrv_close() does not recursively close the chain */ > > - intermediate_state->bs->backing_hd = NULL; > > - bdrv_delete(intermediate_state->bs); > > + /* overlay exists when active != top, need to change backing file for > > it */ > > + if (top_overlay) { > > + ret = bdrv_change_backing_file(top_overlay, (*base)->filename, > > + (*base)->drv ? > > + (*base)->drv->format_name : > > ""); > > } > > - ret = 0; > > - > > exit: > > - QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, > > next) { > > - g_free(intermediate_state); > > - } > > return ret; > > } > > > > diff --git a/block/commit.c b/block/commit.c > > index 2227fc2..c85b188 100644 > > --- a/block/commit.c > > +++ b/block/commit.c > > @@ -17,14 +17,13 @@ > > #include "block/blockjob.h" > > #include "qemu/ratelimit.h" > > > > -enum { > > - /* > > - * Size of data buffer for populating the image file. This should be > > large > > - * enough to process multiple clusters in a single call, so that > > populating > > - * contiguous regions of the image is efficient. > > - */ > > - COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */ > > -}; > > +/* > > + * Size of data buffer for populating the image file. This should be large > > + * enough to process multiple clusters in a single call, so that populating > > + * contiguous regions of the image is efficient. > > + */ > > +#define COMMIT_BUFFER_SECTORS 128 > > +#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE) > > > > #define SLICE_TIME 100000000ULL /* ns */ > > > > @@ -34,6 +33,7 @@ typedef struct CommitBlockJob { > > BlockDriverState *active; > > BlockDriverState *top; > > BlockDriverState *base; > > + BlockDriverState *overlay; > > BlockdevOnError on_error; > > int base_flags; > > int orig_overlay_flags; > > @@ -65,100 +65,109 @@ static void coroutine_fn commit_run(void *opaque) > > BlockDriverState *active = s->active; > > BlockDriverState *top = s->top; > > BlockDriverState *base = s->base; > > - BlockDriverState *overlay_bs; > > int64_t sector_num, end; > > int ret = 0; > > int n = 0; > > void *buf; > > - int bytes_written = 0; > > int64_t base_len; > > + int64_t next_dirty; > > + HBitmapIter hbi; > > > > + buf = qemu_blockalign(top, COMMIT_BUFFER_BYTES); > > ret = s->common.len = bdrv_getlength(top); > > > > - > > if (s->common.len < 0) { > > - goto exit_restore_reopen; > > + goto exit; > > } > > > > ret = base_len = bdrv_getlength(base); > > if (base_len < 0) { > > - goto exit_restore_reopen; > > + goto exit; > > } > > > > if (base_len < s->common.len) { > > ret = bdrv_truncate(base, s->common.len); > > if (ret) { > > - goto exit_restore_reopen; > > + goto exit; > > } > > } > > > > end = s->common.len >> BDRV_SECTOR_BITS; > > - buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); > > > > for (sector_num = 0; sector_num < end; sector_num += n) { > > - uint64_t delay_ns = 0; > > - bool copy; > > > > -wait: > > - /* Note that even when no rate limit is applied we need to yield > > - * with no pending I/O here so that bdrv_drain_all() returns. > > - */ > > - block_job_sleep_ns(&s->common, rt_clock, delay_ns); > > - if (block_job_is_cancelled(&s->common)) { > > - break; > > - } > > /* Copy if allocated above the base */ > > ret = bdrv_co_is_allocated_above(top, base, sector_num, > > - COMMIT_BUFFER_SIZE / > > BDRV_SECTOR_SIZE, > > + COMMIT_BUFFER_SECTORS, > > &n); > > - copy = (ret == 1); > > - trace_commit_one_iteration(s, sector_num, n, ret); > > - if (copy) { > > - if (s->common.speed) { > > - delay_ns = ratelimit_calculate_delay(&s->limit, n); > > - if (delay_ns > 0) { > > - goto wait; > > - } > > - } > > - ret = commit_populate(top, base, sector_num, n, buf); > > - bytes_written += n * BDRV_SECTOR_SIZE; > > + if (ret) { > > + bdrv_set_dirty(top, sector_num, n); > > + } > > + } > > + > > + while (bdrv_get_dirty_count(s->top)) { > > + uint64_t delay_ns = 0; > > + if (block_job_is_cancelled(&s->common)) { > > + goto exit; > > } > > - if (ret < 0) { > > - if (s->on_error == BLOCKDEV_ON_ERROR_STOP || > > - s->on_error == BLOCKDEV_ON_ERROR_REPORT|| > > - (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == > > -ENOSPC)) { > > - goto exit_free_buf; > > - } else { > > - n = 0; > > - continue; > > + > > + bdrv_dirty_iter_init(s->top, &hbi); > > + for (next_dirty = hbitmap_iter_next(&hbi); > > + next_dirty >= 0; > > + next_dirty = hbitmap_iter_next(&hbi)) { > > + sector_num = next_dirty; > > + if (block_job_is_cancelled(&s->common)) { > > + goto exit; > > } > > + delay_ns = ratelimit_calculate_delay(&s->limit, > > + COMMIT_BUFFER_SECTORS); > > + /* Note that even when no rate limit is applied we need to > > yield > > + * with no pending I/O here so that bdrv_drain_all() returns. > > + */ > > + block_job_sleep_ns(&s->common, rt_clock, delay_ns); > > + trace_commit_one_iteration(s, sector_num, > > + COMMIT_BUFFER_SECTORS, ret); > > + ret = commit_populate(top, base, sector_num, > > + COMMIT_BUFFER_SECTORS, buf); > > + if (ret < 0) { > > + if (s->on_error == BLOCKDEV_ON_ERROR_STOP || > > + s->on_error == BLOCKDEV_ON_ERROR_REPORT || > > + (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && > > + ret == -ENOSPC)) { > > + goto exit; > > + } else { > > + continue; > > + } > > + } > > + /* Publish progress */ > > + s->common.offset += COMMIT_BUFFER_BYTES; > > + bdrv_reset_dirty(top, sector_num, COMMIT_BUFFER_SECTORS); > > } > > - /* Publish progress */ > > - s->common.offset += n * BDRV_SECTOR_SIZE; > > } > > > > - ret = 0; > > - > > - if (!block_job_is_cancelled(&s->common) && sector_num == end) { > > - /* success */ > > - ret = bdrv_drop_intermediate(active, top, base); > > + if (!block_job_is_cancelled(&s->common)) { > > + /* Drop intermediate: [top, base) */ > > + ret = bdrv_drop_intermediate(s->overlay, &top, &base); > > + s->common.offset = s->common.len; > > } > > > > -exit_free_buf: > > - qemu_vfree(buf); > > + ret = 0; > > + > > +exit: > > + bdrv_set_dirty_tracking(active, 0); > > > > -exit_restore_reopen: > > /* restore base open flags here if appropriate (e.g., change the base > > back > > * to r/o). These reopens do not need to be atomic, since we won't > > abort > > * even on failure here */ > > - if (s->base_flags != bdrv_get_flags(base)) { > > + if (s->overlay && s->base_flags != bdrv_get_flags(base)) { > > bdrv_reopen(base, s->base_flags, NULL); > > } > > - overlay_bs = bdrv_find_overlay(active, top); > > - if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) > > { > > - bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); > > + > > + if (s->overlay && s->orig_overlay_flags != bdrv_get_flags(s->overlay)) > > { > > + bdrv_reopen(s->overlay, s->orig_overlay_flags, NULL); > > } > > > > + qemu_vfree(buf); > > block_job_completed(&s->common, ret); > > } > > > > @@ -198,13 +207,6 @@ void commit_start(BlockDriverState *bs, > > BlockDriverState *base, > > return; > > } > > > > - /* Once we support top == active layer, remove this check */ > > - if (top == bs) { > > - error_setg(errp, > > - "Top image as the active layer is currently > > unsupported"); > > - return; > > - } > > - > > if (top == base) { > > error_setg(errp, "Invalid files for merge: top and base are the > > same"); > > return; > > @@ -212,23 +214,20 @@ void commit_start(BlockDriverState *bs, > > BlockDriverState *base, > > > > overlay_bs = bdrv_find_overlay(bs, top); > > > > - if (overlay_bs == NULL) { > > - error_setg(errp, "Could not find overlay image for %s:", > > top->filename); > > - return; > > - } > > - > > orig_base_flags = bdrv_get_flags(base); > > - orig_overlay_flags = bdrv_get_flags(overlay_bs); > > + if (overlay_bs) { > > + orig_overlay_flags = bdrv_get_flags(overlay_bs); > > + if (!(orig_overlay_flags & BDRV_O_RDWR)) { > > + reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, > > + orig_overlay_flags | BDRV_O_RDWR); > > + } > > + } > > > > /* convert base & overlay_bs to r/w, if necessary */ > > if (!(orig_base_flags & BDRV_O_RDWR)) { > > reopen_queue = bdrv_reopen_queue(reopen_queue, base, > > orig_base_flags | BDRV_O_RDWR); > > } > > - if (!(orig_overlay_flags & BDRV_O_RDWR)) { > > - reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, > > - orig_overlay_flags | BDRV_O_RDWR); > > - } > > if (reopen_queue) { > > bdrv_reopen_multiple(reopen_queue, &local_err); > > if (local_err != NULL) { > > @@ -237,7 +236,6 @@ void commit_start(BlockDriverState *bs, > > BlockDriverState *base, > > } > > } > > > > - > > s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp); > > if (!s) { > > return; > > @@ -246,13 +244,19 @@ void commit_start(BlockDriverState *bs, > > BlockDriverState *base, > > s->base = base; > > s->top = top; > > s->active = bs; > > + s->overlay = overlay_bs; > > > > s->base_flags = orig_base_flags; > > - s->orig_overlay_flags = orig_overlay_flags; > > + if (overlay_bs) { > > + s->orig_overlay_flags = orig_overlay_flags; > > + } > > > > s->on_error = on_error; > > s->common.co = qemu_coroutine_create(commit_run); > > > > trace_commit_start(bs, base, top, s, s->common.co, opaque); > > + > > + bdrv_set_dirty_tracking(top, COMMIT_BUFFER_BYTES); > > + > > qemu_coroutine_enter(s->common.co, s); > > } > > diff --git a/include/block/block.h b/include/block/block.h > > index b6b9014..caf2c22 100644 > > --- a/include/block/block.h > > +++ b/include/block/block.h > > @@ -197,8 +197,9 @@ int bdrv_commit_all(void); > > int bdrv_change_backing_file(BlockDriverState *bs, > > const char *backing_file, const char *backing_fmt); > > void bdrv_register(BlockDriver *bdrv); > > -int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top, > > - BlockDriverState *base); > > +int bdrv_drop_intermediate(BlockDriverState *top_overlay, > > + BlockDriverState **top, > > + BlockDriverState **base); > > BlockDriverState *bdrv_find_overlay(BlockDriverState *active, > > BlockDriverState *bs); > > BlockDriverState *bdrv_find_base(BlockDriverState *bs); > > > -- Fam