this patch tries to optimize zero write requests by automatically using bdrv_write_zeroes if it is supported by the format.
this should significantly speed up file system initialization and should speed zero write test used to test backend storage performance. the difference can simply be tested by e.g. dd if=/dev/zero of=/dev/vdX bs=1M Signed-off-by: Peter Lieven <p...@kamp.de> --- v1->v2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter - call zero detection only for format (bs->file != NULL) block.c | 39 ++++++++++++++++++++++++++++++++++++++- include/block/block_int.h | 12 ++++++++++++ include/qemu-common.h | 1 + qemu-options.hx | 6 ++++++ util/iov.c | 21 +++++++++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index acb70fd..6bda7ce 100644 --- a/block.c +++ b/block.c @@ -817,6 +817,25 @@ static int bdrv_assign_node_name(BlockDriverState *bs, return 0; } +static int bdrv_set_detect_zeroes(BlockDriverState *bs, + const char *detect_zeroes, + Error **errp) +{ + if (!detect_zeroes || !strncmp(detect_zeroes, "off", 3)) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF; + } else if (!strncmp(detect_zeroes, "on", 2)) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_ON; + } else if (!strncmp(detect_zeroes, "unmap", 5)) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP; + } else { + error_setg(errp, "invalid value for detect-zeroes: %s", + detect_zeroes); + return -EINVAL; + } + + return 0; +} + /* * Common part for opening disk images and files * @@ -827,7 +846,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, { int ret, open_flags; const char *filename; - const char *node_name = NULL; + const char *node_name = NULL, *detect_zeroes = NULL; Error *local_err = NULL; assert(drv != NULL); @@ -855,6 +874,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } qdict_del(options, "node-name"); + detect_zeroes = qdict_get_try_str(options, "detect-zeroes"); + ret = bdrv_set_detect_zeroes(bs, detect_zeroes, errp); + if (ret < 0) { + return ret; + } + qdict_del(options, "detect-zeroes"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. */ @@ -3179,6 +3205,17 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + if (!ret && bs->detect_zeroes != BDRV_DETECT_ZEROES_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && bs->file && + drv->bdrv_co_write_zeroes && qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + /* if the device was not opened with discard=on the below flag + * is immediately cleared again in bdrv_co_do_write_zeroes */ + if (bs->detect_zeroes == BDRV_DETECT_ZEROES_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { diff --git a/include/block/block_int.h b/include/block/block_int.h index cd5bc73..7a3013a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -271,6 +271,17 @@ typedef struct BlockLimits { } BlockLimits; /* + * Different operation modes for automatic zero detection + * to speed the write operation up with bdrv_write_zeroes. + */ +typedef enum { + BDRV_DETECT_ZEROES_OFF = 0x0, + BDRV_DETECT_ZEROES_ON = 0x1, + /* also set the BDRV_MAY_UNMAP flag with bdrv_write_zeroes */ + BDRV_DETECT_ZEROES_UNMAP = 0x2, +} BdrvDetectZeroes; + +/* * Note: the function bdrv_append() copies and swaps contents of * BlockDriverStates, so if you add new fields to this struct, please * inspect bdrv_append() to determine if the new fields need to be @@ -365,6 +376,7 @@ struct BlockDriverState { BlockJob *job; QDict *options; + BdrvDetectZeroes detect_zeroes; }; int get_tmp_filename(char *filename, int size); diff --git a/include/qemu-common.h b/include/qemu-common.h index c8a58a8..574da73 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -330,6 +330,7 @@ void qemu_iovec_concat(QEMUIOVector *dst, void qemu_iovec_concat_iov(QEMUIOVector *dst, struct iovec *src_iov, unsigned int src_cnt, size_t soffset, size_t sbytes); +bool qemu_iovec_is_zero(QEMUIOVector *qiov); void qemu_iovec_destroy(QEMUIOVector *qiov); void qemu_iovec_reset(QEMUIOVector *qiov); size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset, diff --git a/qemu-options.hx b/qemu-options.hx index ee5437b..1836307 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -410,6 +410,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" + " [,detect-zeroes=on|off|unmap]\n" " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n" " [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n" " [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n" @@ -470,6 +471,11 @@ Open drive @option{file} as read-only. Guest write attempts will fail. @item copy-on-read=@var{copy-on-read} @var{copy-on-read} is "on" or "off" and enables whether to copy read backing file sectors into the image file. +@item detect-zeroes=@var{detect-zeroes} +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic +conversion of plain zero writes by the OS to driver specific optimized +zero write commands. If "unmap" is choosen and @var{discard} is "on" +a zero write may even be converted to an UNMAP operation. @end table By default, the @option{cache=writeback} mode is used. It will report data diff --git a/util/iov.c b/util/iov.c index 6569b5a..0b17392 100644 --- a/util/iov.c +++ b/util/iov.c @@ -335,6 +335,27 @@ void qemu_iovec_concat(QEMUIOVector *dst, qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes); } +/* + * check if the contents of the iovecs is all zero + */ +bool qemu_iovec_is_zero(QEMUIOVector *qiov) +{ + int i; + for (i = 0; i < qiov->niov; i++) { + size_t offs = qiov->iov[i].iov_len & ~(4 * sizeof(long) - 1); + uint8_t *ptr = qiov->iov[i].iov_base; + if (offs && !buffer_is_zero(qiov->iov[i].iov_base, offs)) { + return false; + } + for (; offs < qiov->iov[i].iov_len; offs++) { + if (ptr[offs]) { + return false; + } + } + } + return true; +} + void qemu_iovec_destroy(QEMUIOVector *qiov) { assert(qiov->nalloc != -1); -- 1.7.9.5