Il 27/07/2012 15:58, Anthony Liguori ha scritto: >> Note that if you want to test this feature, you must use a qcow2 >> version 3 file. To create that, do: >> >> qemu-img create -f qcow2 -o compat=1.1 <name> <size> >> >> Ordinary qcow2 (v2) and raw do NOT know how to treat zero blocks >> specially, so you won't notice any difference. > > It would be a lot more useful if this was implemented as a dynamic > option. Offline sparsification is one use-case, but online > sparsification is another. > > This can be orchestrated through the qemu-ga binary. First, you enable > zero detection, then you ask qemu-ga to create a file filled with zeros > of 90% of the available free space unlinking the file as soon as you > open it. You then close the fd after you've written out the zeros. > Then you disable zero detection and keep going.
Or just finish up discard support and use the existing fstrim command of qemu-ga. :) Paolo > > This would let virt-manager or oVirt implement a "compact disk" option > against running VMs. > > I don't think it's much of a change to your existing patch either. > > Regards, > > Anthony Liguori > >> >> Signed-off-by: Richard W.M. Jones <rjo...@redhat.com> >> --- >> block.c | 33 ++++++++++++++++++++++++++++++++- >> block.h | 1 + >> block_int.h | 1 + >> blockdev.c | 10 ++++++++++ >> hmp-commands.hx | 3 ++- >> qemu-config.c | 4 ++++ >> qemu-options.hx | 7 ++++++- >> 7 files changed, 56 insertions(+), 3 deletions(-) >> >> diff --git a/block.c b/block.c >> index 7547051..49e8186 100644 >> --- a/block.c >> +++ b/block.c >> @@ -639,6 +639,8 @@ static int bdrv_open_common(BlockDriverState *bs, const >> char *filename, >> bdrv_enable_copy_on_read(bs); >> } >> >> + bs->detect_zero = !!(flags & BDRV_O_DETECT_ZERO); >> + >> pstrcpy(bs->filename, sizeof(bs->filename), filename); >> >> if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv)) { >> @@ -654,7 +656,7 @@ static int bdrv_open_common(BlockDriverState *bs, const >> char *filename, >> * Clear flags that are internal to the block layer before opening the >> * image. >> */ >> - open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING); >> + open_flags = flags & ~(BDRV_O_SNAPSHOT | BDRV_O_NO_BACKING | >> BDRV_O_DETECT_ZERO); >> >> /* >> * Snapshots should be writable. >> @@ -1940,6 +1942,33 @@ static int coroutine_fn >> bdrv_co_do_write_zeroes(BlockDriverState *bs, >> } >> >> /* >> + * Detect zeroes: This is very conservative and could be a lot more >> + * clever/complex. >> + * >> + * All it does now is detect if the whole iovec contains nothing but >> + * zero bytes, and if so call bdrv_co_do_write_zeroes, otherwise call >> + * drv->bdrv_co_writev. >> + * >> + * It would be possible to split requests up and do this >> + * iovec-element-by-element or even sector-by-sector. >> + */ >> +static int detect_zeroes(BlockDriverState *bs, >> + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) >> +{ >> + BlockDriver *drv = bs->drv; >> + int i; >> + >> + for (i = 0; i < qiov->niov; i++) { >> + if (!buffer_is_zero (qiov->iov[i].iov_base, qiov->iov[i].iov_len)) >> + goto not_zero; >> + } >> + return bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); >> + >> + not_zero: >> + return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); >> +} >> + >> +/* >> * Handle a write request in coroutine context >> */ >> static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, >> @@ -1973,6 +2002,8 @@ static int coroutine_fn >> bdrv_co_do_writev(BlockDriverState *bs, >> >> if (flags & BDRV_REQ_ZERO_WRITE) { >> ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); >> + } else if (bs->detect_zero) { >> + ret = detect_zeroes (bs, sector_num, nb_sectors, qiov); >> } else { >> ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); >> } >> diff --git a/block.h b/block.h >> index 7408acc..fcc6de6 100644 >> --- a/block.h >> +++ b/block.h >> @@ -79,6 +79,7 @@ typedef struct BlockDevOps { >> #define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */ >> #define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image >> */ >> #define BDRV_O_INCOMING 0x0800 /* consistency hint for incoming >> migration */ >> +#define BDRV_O_DETECT_ZERO 0x1000 /* detect zero writes */ >> >> #define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB | >> BDRV_O_NO_FLUSH) >> >> diff --git a/block_int.h b/block_int.h >> index 3d4abc6..1008103 100644 >> --- a/block_int.h >> +++ b/block_int.h >> @@ -271,6 +271,7 @@ struct BlockDriverState { >> int sg; /* if true, the device is a /dev/sg* */ >> int copy_on_read; /* if true, copy read backing sectors into image >> note this is a reference count */ >> + int detect_zero; /* if true, detect all zero byte writes */ >> >> BlockDriver *drv; /* NULL means no media */ >> void *opaque; >> diff --git a/blockdev.c b/blockdev.c >> index 67895b2..580c078 100644 >> --- a/blockdev.c >> +++ b/blockdev.c >> @@ -296,6 +296,7 @@ DriveInfo *drive_init(QemuOpts *opts, int >> default_to_scsi) >> BlockIOLimit io_limits; >> int snapshot = 0; >> bool copy_on_read; >> + bool detect_zero; >> int ret; >> >> translation = BIOS_ATA_TRANSLATION_AUTO; >> @@ -313,6 +314,7 @@ DriveInfo *drive_init(QemuOpts *opts, int >> default_to_scsi) >> snapshot = qemu_opt_get_bool(opts, "snapshot", 0); >> ro = qemu_opt_get_bool(opts, "readonly", 0); >> copy_on_read = qemu_opt_get_bool(opts, "copy-on-read", false); >> + detect_zero = qemu_opt_get_bool(opts, "detect-zero", false); >> >> file = qemu_opt_get(opts, "file"); >> serial = qemu_opt_get(opts, "serial"); >> @@ -595,6 +597,10 @@ DriveInfo *drive_init(QemuOpts *opts, int >> default_to_scsi) >> bdrv_flags |= BDRV_O_COPY_ON_READ; >> } >> >> + if (detect_zero) { >> + bdrv_flags |= BDRV_O_DETECT_ZERO; >> + } >> + >> if (runstate_check(RUN_STATE_INMIGRATE)) { >> bdrv_flags |= BDRV_O_INCOMING; >> } >> @@ -612,6 +618,10 @@ DriveInfo *drive_init(QemuOpts *opts, int >> default_to_scsi) >> >> bdrv_flags |= ro ? 0 : BDRV_O_RDWR; >> >> + if (ro && detect_zero) { >> + error_report("warning: ignoring detect-zero on readonly drive"); >> + } >> + >> ret = bdrv_open(dinfo->bdrv, file, bdrv_flags, drv); >> if (ret < 0) { >> error_report("could not open disk image %s: %s", >> diff --git a/hmp-commands.hx b/hmp-commands.hx >> index 18cb415..7dcdcfe 100644 >> --- a/hmp-commands.hx >> +++ b/hmp-commands.hx >> @@ -908,7 +908,8 @@ ETEXI >> "[,unit=m][,media=d][,index=i]\n" >> "[,cyls=c,heads=h,secs=s[,trans=t]]\n" >> "[,snapshot=on|off][,cache=on|off]\n" >> - "[,readonly=on|off][,copy-on-read=on|off]", >> + "[,readonly=on|off][,copy-on-read=on|off]\n" >> + "[,detect-zero=on|off]\n", >> .help = "add drive to PCI storage controller", >> .mhandler.cmd = drive_hot_add, >> }, >> diff --git a/qemu-config.c b/qemu-config.c >> index be84a03..fb19d15 100644 >> --- a/qemu-config.c >> +++ b/qemu-config.c >> @@ -113,6 +113,10 @@ static QemuOptsList qemu_drive_opts = { >> .name = "copy-on-read", >> .type = QEMU_OPT_BOOL, >> .help = "copy read data from backing file into image file", >> + },{ >> + .name = "detect-zero", >> + .type = QEMU_OPT_BOOL, >> + .help = "detect zero writes and handle space-efficiently", >> }, >> { /* end of list */ } >> }, >> diff --git a/qemu-options.hx b/qemu-options.hx >> index 8b66264..5d5da6e 100644 >> --- a/qemu-options.hx >> +++ b/qemu-options.hx >> @@ -141,7 +141,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, >> " [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n" >> " >> [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" >> " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" >> - " [,readonly=on|off][,copy-on-read=on|off]\n" >> + " [,readonly=on|off][,copy-on-read=on|off][,detect-zero=on|off]\n" >> " >> [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]][[,iops=i]|[[,iops_rd=r][,iops_wr=w]]\n" >> " use 'file' as a drive image\n", QEMU_ARCH_ALL) >> STEXI >> @@ -196,6 +196,11 @@ Open drive @option{file} as read-only. Guest write >> attempts will fail. >> @item copy-on-read=@var{copy-on-read} >> @var{copy-on-read} is "on" or "off" and enables whether to copy read backing >> file sectors into the image file. >> +@item detect-zero=@var{detect-zero} >> +@var{detect-zero} is "on" or "off". If "on", guest writes containing >> +all zero bytes are detected and may be handled more space-efficiently by >> +the underlying block device. Under some workloads this may result in >> +worse performance, so it defaults to "off". >> @end table >> >> By default, writethrough caching is used for all block device. This means >> that >> -- >> 1.7.10.4 > >