On 06.11.2015 11:22, Fam Zheng wrote: > The "pnum < nb_sectors" condition in deciding whether to actually copy > data is unnecessarily strict, and the qiov initialization is > unnecessary too, for both bdrv_aio_write_zeroes and bdrv_aio_discard > branches. > > Reorganize mirror_iteration flow so that we: > > 1) Find the contiguous zero/discarded sectors with > bdrv_get_block_status_above() before deciding what to do. We query > s->buf_size sized blocks at a time. > > 2) If the sectors in question are zeroed/discarded and aligned to > target cluster, issue zero write or discard accordingly. It's done > in mirror_do_zero_or_discard, where we don't add buffer to qiov. > > 3) Otherwise, do the same loop as before in mirror_do_read. > > Signed-off-by: Fam Zheng <f...@redhat.com> > --- > block/mirror.c | 161 > +++++++++++++++++++++++++++++++++++++++++++++------------ > 1 file changed, 127 insertions(+), 34 deletions(-)
Looks good overall, some comments below. Max > diff --git a/block/mirror.c b/block/mirror.c > index b1252a1..ac796b4 100644 > --- a/block/mirror.c > +++ b/block/mirror.c > @@ -157,28 +157,21 @@ static void mirror_read_complete(void *opaque, int ret) > mirror_write_complete, op); > } > > -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) > +static uint64_t mirror_do_read(MirrorBlockJob *s) > { > BlockDriverState *source = s->common.bs; > - int nb_sectors, sectors_per_chunk, nb_chunks; > - int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; > + int sectors_per_chunk, nb_sectors, nb_chunks; > + int64_t end, next_chunk, next_sector, hbitmap_next_sector, sector_num; > uint64_t delay_ns = 0; > MirrorOp *op; > - int pnum; > - int64_t ret; > > - s->sector_num = hbitmap_iter_next(&s->hbi); > - if (s->sector_num < 0) { > - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > - s->sector_num = hbitmap_iter_next(&s->hbi); > - trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); > - assert(s->sector_num >= 0); > - } > + sector_num = s->sector_num; > + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; > + end = s->bdev_length / BDRV_SECTOR_SIZE; > > + next_sector = sector_num; > + next_chunk = sector_num / sectors_per_chunk; @next_sector and @next_chunk set here... > hbitmap_next_sector = s->sector_num; > - sector_num = s->sector_num; > - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; > - end = s->bdev_length / BDRV_SECTOR_SIZE; > > /* Extend the QEMUIOVector to include all adjacent blocks that will > * be copied in this operation. > @@ -198,14 +191,6 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > next_sector = sector_num; > next_chunk = sector_num / sectors_per_chunk; ...and here already. So the above seems superfluous, considering that they are not read in between. (If you keep hbitmap_next_sector = s->sector_num; above the sector_num = ... 
block, that may reduce conflicts further) > > - /* Wait for I/O to this cluster (from a previous iteration) to be done. > */ > - while (test_bit(next_chunk, s->in_flight_bitmap)) { > - trace_mirror_yield_in_flight(s, sector_num, s->in_flight); > - s->waiting_for_io = true; > - qemu_coroutine_yield(); > - s->waiting_for_io = false; > - } > - > do { > int added_sectors, added_chunks; > > @@ -301,24 +286,132 @@ static uint64_t coroutine_fn > mirror_iteration(MirrorBlockJob *s) > s->sectors_in_flight += nb_sectors; > trace_mirror_one_iteration(s, sector_num, nb_sectors); > > - ret = bdrv_get_block_status_above(source, NULL, sector_num, > - nb_sectors, &pnum); > - if (ret < 0 || pnum < nb_sectors || > - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { > - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > - mirror_read_complete, op); > - } else if (ret & BDRV_BLOCK_ZERO) { > + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, > + mirror_read_complete, op); > + return delay_ns; > +} > + > +static uint64_t mirror_do_zero_or_discard(MirrorBlockJob *s, > + int64_t sector_num, > + int nb_sectors, > + bool is_discard) > +{ > + int sectors_per_chunk, nb_chunks; > + int64_t next_chunk, next_sector, hbitmap_next_sector; > + uint64_t delay_ns = 0; > + MirrorOp *op; > + > + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; > + assert(nb_sectors >= sectors_per_chunk); > + next_chunk = sector_num / sectors_per_chunk; > + nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk); > + bitmap_set(s->in_flight_bitmap, next_chunk, nb_chunks); > + delay_ns = ratelimit_calculate_delay(&s->limit, nb_sectors); > + > + /* Allocate a MirrorOp that is used as an AIO callback. The qiov is > zeroed > + * out so the freeing in iteration is nop. */ > + op = g_new0(MirrorOp, 1); > + op->s = s; > + op->sector_num = sector_num; > + op->nb_sectors = nb_sectors; > + > + /* Advance the HBitmapIter in parallel, so that we do not examine > + * the same sector twice. 
> + */ > + hbitmap_next_sector = sector_num; > + next_sector = sector_num + nb_sectors; > + while (next_sector > hbitmap_next_sector) { > + hbitmap_next_sector = hbitmap_iter_next(&s->hbi); > + if (hbitmap_next_sector < 0) { > + break; > + } > + } > + > + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); > + s->in_flight++; > + s->sectors_in_flight += nb_sectors; > + if (is_discard) { > + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > + mirror_write_complete, op); > + } else { > bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, > s->unmap ? BDRV_REQ_MAY_UNMAP : 0, > mirror_write_complete, op); > - } else { > - assert(!(ret & BDRV_BLOCK_DATA)); > - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, > - mirror_write_complete, op); > } > + > return delay_ns; > } > > +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) > +{ > + BlockDriverState *source = s->common.bs; > + int sectors_per_chunk; > + int64_t sector_num, next_chunk; > + int ret; > + int contiguous_sectors = s->buf_size >> BDRV_SECTOR_BITS; > + enum MirrorMethod { > + MIRROR_METHOD_COPY, > + MIRROR_METHOD_ZERO, > + MIRROR_METHOD_DISCARD > + } mirror_method; > + > + s->sector_num = hbitmap_iter_next(&s->hbi); > + if (s->sector_num < 0) { > + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); > + s->sector_num = hbitmap_iter_next(&s->hbi); > + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); > + assert(s->sector_num >= 0); > + } > + > + sector_num = s->sector_num; > + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; > + next_chunk = sector_num / sectors_per_chunk; > + > + /* Wait for I/O to this cluster (from a previous iteration) to be done. 
> */ > + while (test_bit(next_chunk, s->in_flight_bitmap)) { > + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); > + s->waiting_for_io = true; > + qemu_coroutine_yield(); > + s->waiting_for_io = false; > + } > + > + mirror_method = MIRROR_METHOD_COPY; > + ret = bdrv_get_block_status_above(source, NULL, sector_num, > + contiguous_sectors, > + &contiguous_sectors); We did have NULL as the base before, but is that correct? Shouldn't we use backing_bs(source) or s->base if @sync is none or top? Examining this is fun, the short answer is probably "NULL is wrong, but it's the best we can do here". (1) with NULL here: $ ./qemu-img create -f qcow2 -b base.qcow2 top.qcow2 $ ./qemu-io -c 'write -P 42 0 64k' base.qcow2 $ ./qemu-img create -f qcow2 -o compat=0.10 -b base.qcow2 top.qcow2 # compat=0.10 is important because otherwise discard will create a # zero cluster instead of truly discarding $ (echo "{'execute':'qmp_capabilities'} {'execute':'drive-mirror','arguments': {'device':'drive0','target':'out.qcow2', 'format':'qcow2','sync':'top'}} {'execute':'human-monitor-command','arguments': {'command-line':'qemu-io drive0 \"discard 0 64k\"'}} {'execute':'block-job-complete','arguments': {'device':'drive0'}} {'execute':'quit'}") | \ x86_64-softmmu/qemu-system-x86_64 -qmp stdio \ -drive if=none,file=top.qcow2,id=drive0,discard=unmap {"QMP": {"version": {"qemu": {"micro": 50, "minor": 4, "major": 2}, "package": ""}, "capabilities": []}} {"return": {}} Formatting 'out.qcow2', fmt=qcow2 size=67108864 backing_file=base.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16 {"return": {}} discard 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (3.815 GiB/sec and 62500.0000 ops/sec) {"return": ""} {"timestamp": {"seconds": 1446830775, "microseconds": 198114}, "event": "BLOCK_JOB_READY", "data": {"device": "drive0", "len": 0, "offset": 0, "speed": 0, "type": "mirror"}} {"return": {}} {"return": {}} {"timestamp": {"seconds": 
1446830775, "microseconds": 198371}, "event": "SHUTDOWN"} {"timestamp": {"seconds": 1446830775, "microseconds": 205506}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "drive0", "len": 65536, "offset": 65536, "speed": 0, "type": "mirror"}} (Note that the discard must have been executed before BLOCK_JOB_READY, it's a bit racy) Now we check: $ ./qemu-io -c 'read -P 0 0 64k' out.qcow2 Pattern verification failed at offset 0, 65536 bytes read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec) $ ./qemu-io -c 'read -P 42 0 64k' out.qcow2 read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (868.056 MiB/sec and 13888.8889 ops/sec) Well, that seems reasonable, considering that out.qcow2 has base.qcow2 as the backing file, but: $ ./qemu-img map out.qcow2 Offset Length Mapped to File 0 0x10000 0x50000 out.qcow2 Hm... Well, at least out.qcow2 and top.qcow2 return the same values when read, but we don't need to copy that from the backing file. Just leaving the cluster unallocated would have been enough. (2) So with s->base instead of NULL we get: $ ./qemu-io -c 'read -P 0 0 64k' out.qcow2 read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (856.164 MiB/sec and 13698.6301 ops/sec) $ ./qemu-io -c 'read -P 42 0 64k' out.qcow2 Pattern verification failed at offset 0, 65536 bytes read 65536/65536 bytes at offset 0 64 KiB, 1 ops; 0.0000 sec (892.857 MiB/sec and 14285.7143 ops/sec) And: $ ./qemu-img map out.qcow2 Offset Length Mapped to File So the map output looks right, but now we don't read the backing file anymore. That's because for qcow2 v3, qemu writes zero clusters when asked to discard (that's why I had to use compat=0.10 for top.qcow2). Using a compat=0.10 image with @mode = existing fixes that. But well... So the issue is: We don't have a reliable function to punch a hole into an overlay file. 
discard could do that (and does so for qcow2 v2), but we considered returning zeros the better option if possible because people might find that more reasonable. So while it would be best to copy holes in the overlay file created by discards to the mirrored file, there is no way for us to do that. Therefore, the best we can do is copy from the backing file. So, OK, NULL it is. > + > + contiguous_sectors -= contiguous_sectors % sectors_per_chunk; > + if (ret < 0 || contiguous_sectors < sectors_per_chunk) { > + contiguous_sectors = sectors_per_chunk; > + } else if (!(ret & BDRV_BLOCK_DATA)) { > + int64_t target_sector_num; > + int target_nb_sectors; > + bdrv_round_to_clusters(s->target, sector_num, contiguous_sectors, > + &target_sector_num, &target_nb_sectors); > + if (target_sector_num == sector_num && > + target_nb_sectors == contiguous_sectors) { > + mirror_method = ret & BDRV_BLOCK_ZERO ? > + MIRROR_METHOD_ZERO : > + MIRROR_METHOD_DISCARD; > + } > + } > + > + switch (mirror_method) { > + case MIRROR_METHOD_COPY: > + return mirror_do_read(s); > + case MIRROR_METHOD_ZERO: > + return mirror_do_zero_or_discard(s, sector_num, > + contiguous_sectors, > + false); > + case MIRROR_METHOD_DISCARD: > + return mirror_do_zero_or_discard(s, sector_num, > + contiguous_sectors, > + false); s/false/true/? > + default: > + abort(); > + } > +} > + > static void mirror_free_init(MirrorBlockJob *s) > { > int granularity = s->granularity; >
signature.asc
Description: OpenPGP digital signature