On Fri, 03/28 15:46, Peter Lieven wrote: > this patch tries to optimize zero write requests > by automatically using bdrv_write_zeroes if it is > supported by the format. > > This significantly speeds up file system initialization and > should speed zero write test used to test backend storage > performance. > > I ran the following 2 tests on my internal SSD with a > 50G QCOW2 container and on an attached iSCSI storage. > > a) mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vdX > > QCOW2 [off] [on] [unmap] > ----- > runtime: 14secs 1.1secs 1.1secs > filesize: 937M 18M 18M > > iSCSI [off] [on] [unmap] > ---- > runtime: 9.3s 0.9s 0.9s > > b) dd if=/dev/zero of=/dev/vdX bs=1M oflag=direct > > QCOW2 [off] [on] [unmap] > ----- > runtime: 246secs 18secs 18secs > filesize: 51G 192K 192K > throughput: 203M/s 2.3G/s 2.3G/s > > iSCSI* [off] [on] [unmap] > ---- > runtime: 8mins 45secs 33secs > throughput: 106M/s 1.2G/s 1.6G/s > allocated: 100% 100% 0% > > * The storage was connected via an 1Gbit interface. > It seems to internally handle writing zeroes > via WRITESAME16 very fast. > > Signed-off-by: Peter Lieven <p...@kamp.de> > --- > v1->v2: - added tests to commit message (Markus) > RFCv2->v1: - fixed paramter parsing strncmp -> strcmp (Eric) > - fixed typo (choosen->chosen) (Eric) > - added second example to commit msg > > RFCv1->RFCv2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter > - call zero detection only for format (bs->file != NULL) > > block.c | 39 ++++++++++++++++++++++++++++++++++++++- > include/block/block_int.h | 12 ++++++++++++ > include/qemu-common.h | 1 + > qemu-options.hx | 6 ++++++ > util/iov.c | 21 +++++++++++++++++++++ > 5 files changed, 78 insertions(+), 1 deletion(-) > > diff --git a/block.c b/block.c > index acb70fd..3f61f76 100644 > --- a/block.c > +++ b/block.c > @@ -817,6 +817,25 @@ static int bdrv_assign_node_name(BlockDriverState *bs, > return 0; > } > > +static int bdrv_set_detect_zeroes(BlockDriverState *bs, > + const char *detect_zeroes, > + Error **errp) > +{ > + if (!detect_zeroes || !strcmp(detect_zeroes, "off")) { > + bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF; > + } else if (!strcmp(detect_zeroes, "on")) { > + bs->detect_zeroes = BDRV_DETECT_ZEROES_ON; > + } else if (!strcmp(detect_zeroes, "unmap")) { > + bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP; > + } else { > + error_setg(errp, "invalid value for detect-zeroes: %s", > + detect_zeroes); > + return -EINVAL; > + } > + > + return 0; > +} > + > /* > * Common part for opening disk images and files > * > @@ -827,7 +846,7 @@ static int bdrv_open_common(BlockDriverState *bs, > BlockDriverState *file, > { > int ret, open_flags; > const char *filename; > - const char *node_name = NULL; > + const char *node_name = NULL, *detect_zeroes = NULL; > Error *local_err = NULL; > > assert(drv != NULL); > @@ -855,6 +874,13 @@ static int bdrv_open_common(BlockDriverState *bs, > BlockDriverState *file, > } > qdict_del(options, "node-name"); > > + detect_zeroes = qdict_get_try_str(options, "detect-zeroes"); > + ret = bdrv_set_detect_zeroes(bs, detect_zeroes, errp); > + if (ret < 0) { > + return ret; > + } > + qdict_del(options, "detect-zeroes"); > + > /* bdrv_open() with directly using a protocol as drv. This layer is > already > * opened, so assign it to bs (while file becomes a closed > BlockDriverState) > * and return immediately. */ > @@ -3179,6 +3205,17 @@ static int coroutine_fn > bdrv_aligned_pwritev(BlockDriverState *bs, > > ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); > > + if (!ret && bs->detect_zeroes != BDRV_DETECT_ZEROES_OFF && > + !(flags & BDRV_REQ_ZERO_WRITE) && bs->file && > + drv->bdrv_co_write_zeroes && qemu_iovec_is_zero(qiov)) { > + flags |= BDRV_REQ_ZERO_WRITE; > + /* if the device was not opened with discard=on the below flag > + * is immediately cleared again in bdrv_co_do_write_zeroes */ > + if (bs->detect_zeroes == BDRV_DETECT_ZEROES_UNMAP) { > + flags |= BDRV_REQ_MAY_UNMAP; > + } > + } > + > if (ret < 0) { > /* Do nothing, write notifier decided to fail this request */ > } else if (flags & BDRV_REQ_ZERO_WRITE) { > diff --git a/include/block/block_int.h b/include/block/block_int.h > index cd5bc73..7a3013a 100644 > --- a/include/block/block_int.h > +++ b/include/block/block_int.h > @@ -271,6 +271,17 @@ typedef struct BlockLimits { > } BlockLimits; > > /* > + * Different operation modes for automatic zero detection > + * to speed the write operation up with bdrv_write_zeroes. > + */ > +typedef enum { > + BDRV_DETECT_ZEROES_OFF = 0x0, > + BDRV_DETECT_ZEROES_ON = 0x1, > + /* also set the BDRV_MAY_UNMAP flag with bdrv_write_zeroes */ > + BDRV_DETECT_ZEROES_UNMAP = 0x2, > +} BdrvDetectZeroes; > + > +/* > * Note: the function bdrv_append() copies and swaps contents of > * BlockDriverStates, so if you add new fields to this struct, please > * inspect bdrv_append() to determine if the new fields need to be > @@ -365,6 +376,7 @@ struct BlockDriverState { > BlockJob *job; > > QDict *options; > + BdrvDetectZeroes detect_zeroes; > }; > > int get_tmp_filename(char *filename, int size); > diff --git a/include/qemu-common.h b/include/qemu-common.h > index c8a58a8..574da73 100644 > --- a/include/qemu-common.h > +++ b/include/qemu-common.h > @@ -330,6 +330,7 @@ void qemu_iovec_concat(QEMUIOVector *dst, > void qemu_iovec_concat_iov(QEMUIOVector *dst, > struct iovec *src_iov, unsigned int src_cnt, > size_t soffset, size_t sbytes); > +bool qemu_iovec_is_zero(QEMUIOVector *qiov); > void qemu_iovec_destroy(QEMUIOVector *qiov); > void qemu_iovec_reset(QEMUIOVector *qiov); > size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset, > diff --git a/qemu-options.hx b/qemu-options.hx > index ee5437b..f824d9b 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -410,6 +410,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, > " > [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" > " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" > " [,readonly=on|off][,copy-on-read=on|off]\n" > + " [,detect-zeroes=on|off|unmap]\n" > " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n" > " [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n" > " [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n" > @@ -470,6 +471,11 @@ Open drive @option{file} as read-only. Guest write > attempts will fail. > @item copy-on-read=@var{copy-on-read} > @var{copy-on-read} is "on" or "off" and enables whether to copy read backing > file sectors into the image file. > +@item detect-zeroes=@var{detect-zeroes} > +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic > +conversion of plain zero writes by the OS to driver specific optimized > +zero write commands. If "unmap" is chosen and @var{discard} is "on" > +a zero write may even be converted to an UNMAP operation. > @end table > > By default, the @option{cache=writeback} mode is used. It will report data > diff --git a/util/iov.c b/util/iov.c > index 6569b5a..0b17392 100644 > --- a/util/iov.c > +++ b/util/iov.c > @@ -335,6 +335,27 @@ void qemu_iovec_concat(QEMUIOVector *dst, > qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes); > } > > +/* > + * check if the contents of the iovecs is all zero
contents of ... are? content of ... is? Other than that, Reviewed-by: Fam Zheng <f...@redhat.com> > + */ > +bool qemu_iovec_is_zero(QEMUIOVector *qiov) > +{ > + int i; > + for (i = 0; i < qiov->niov; i++) { > + size_t offs = qiov->iov[i].iov_len & ~(4 * sizeof(long) - 1); > + uint8_t *ptr = qiov->iov[i].iov_base; > + if (offs && !buffer_is_zero(qiov->iov[i].iov_base, offs)) { > + return false; > + } > + for (; offs < qiov->iov[i].iov_len; offs++) { > + if (ptr[offs]) { > + return false; > + } > + } > + } > + return true; > +} > + > void qemu_iovec_destroy(QEMUIOVector *qiov) > { > assert(qiov->nalloc != -1); > -- > 1.7.9.5 >