this patch tries to optimize zero write requests by automatically using bdrv_write_zeroes if it is supported by the format.
This significantly speeds up file system initialization and should speed zero write test used to test backend storage performance. I ran the following 2 tests on my internal SSD with a 50G QCOW2 container and on an attached iSCSI storage. a) mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/vdX QCOW2 [off] [on] [unmap] ----- runtime: 14secs 1.1secs 1.1secs filesize: 937M 18M 18M iSCSI [off] [on] [unmap] ---- runtime: 9.3s 0.9s 0.9s b) dd if=/dev/zero of=/dev/vdX bs=1M oflag=direct QCOW2 [off] [on] [unmap] ----- runtime: 246secs 18secs 18secs filesize: 51G 192K 192K throughput: 203M/s 2.3G/s 2.3G/s iSCSI* [off] [on] [unmap] ---- runtime: 8mins 45secs 33secs throughput: 106M/s 1.2G/s 1.6G/s allocated: 100% 100% 0% * The storage was connected via an 1Gbit interface. It seems to internally handle writing zeroes via WRITESAME16 very fast. Signed-off-by: Peter Lieven <p...@kamp.de> --- v1->v2: - added tests to commit message (Markus) RFCv2->v1: - fixed paramter parsing strncmp -> strcmp (Eric) - fixed typo (choosen->chosen) (Eric) - added second example to commit msg RFCv1->RFCv2: - add detect-zeroes=off|on|unmap knob to drive cmdline parameter - call zero detection only for format (bs->file != NULL) block.c | 39 ++++++++++++++++++++++++++++++++++++++- include/block/block_int.h | 12 ++++++++++++ include/qemu-common.h | 1 + qemu-options.hx | 6 ++++++ util/iov.c | 21 +++++++++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index acb70fd..3f61f76 100644 --- a/block.c +++ b/block.c @@ -817,6 +817,25 @@ static int bdrv_assign_node_name(BlockDriverState *bs, return 0; } +static int bdrv_set_detect_zeroes(BlockDriverState *bs, + const char *detect_zeroes, + Error **errp) +{ + if (!detect_zeroes || !strcmp(detect_zeroes, "off")) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_OFF; + } else if (!strcmp(detect_zeroes, "on")) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_ON; + } else if (!strcmp(detect_zeroes, "unmap")) { + bs->detect_zeroes = BDRV_DETECT_ZEROES_UNMAP; + } else { + error_setg(errp, "invalid value for detect-zeroes: %s", + detect_zeroes); + return -EINVAL; + } + + return 0; +} + /* * Common part for opening disk images and files * @@ -827,7 +846,7 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, { int ret, open_flags; const char *filename; - const char *node_name = NULL; + const char *node_name = NULL, *detect_zeroes = NULL; Error *local_err = NULL; assert(drv != NULL); @@ -855,6 +874,13 @@ static int bdrv_open_common(BlockDriverState *bs, BlockDriverState *file, } qdict_del(options, "node-name"); + detect_zeroes = qdict_get_try_str(options, "detect-zeroes"); + ret = bdrv_set_detect_zeroes(bs, detect_zeroes, errp); + if (ret < 0) { + return ret; + } + qdict_del(options, "detect-zeroes"); + /* bdrv_open() with directly using a protocol as drv. This layer is already * opened, so assign it to bs (while file becomes a closed BlockDriverState) * and return immediately. */ @@ -3179,6 +3205,17 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + if (!ret && bs->detect_zeroes != BDRV_DETECT_ZEROES_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && bs->file && + drv->bdrv_co_write_zeroes && qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + /* if the device was not opened with discard=on the below flag + * is immediately cleared again in bdrv_co_do_write_zeroes */ + if (bs->detect_zeroes == BDRV_DETECT_ZEROES_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { diff --git a/include/block/block_int.h b/include/block/block_int.h index cd5bc73..7a3013a 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -271,6 +271,17 @@ typedef struct BlockLimits { } BlockLimits; /* + * Different operation modes for automatic zero detection + * to speed the write operation up with bdrv_write_zeroes. + */ +typedef enum { + BDRV_DETECT_ZEROES_OFF = 0x0, + BDRV_DETECT_ZEROES_ON = 0x1, + /* also set the BDRV_MAY_UNMAP flag with bdrv_write_zeroes */ + BDRV_DETECT_ZEROES_UNMAP = 0x2, +} BdrvDetectZeroes; + +/* * Note: the function bdrv_append() copies and swaps contents of * BlockDriverStates, so if you add new fields to this struct, please * inspect bdrv_append() to determine if the new fields need to be @@ -365,6 +376,7 @@ struct BlockDriverState { BlockJob *job; QDict *options; + BdrvDetectZeroes detect_zeroes; }; int get_tmp_filename(char *filename, int size); diff --git a/include/qemu-common.h b/include/qemu-common.h index c8a58a8..574da73 100644 --- a/include/qemu-common.h +++ b/include/qemu-common.h @@ -330,6 +330,7 @@ void qemu_iovec_concat(QEMUIOVector *dst, void qemu_iovec_concat_iov(QEMUIOVector *dst, struct iovec *src_iov, unsigned int src_cnt, size_t soffset, size_t sbytes); +bool qemu_iovec_is_zero(QEMUIOVector *qiov); void qemu_iovec_destroy(QEMUIOVector *qiov); void qemu_iovec_reset(QEMUIOVector *qiov); size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset, diff --git a/qemu-options.hx b/qemu-options.hx index ee5437b..f824d9b 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -410,6 +410,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive, " [,cache=writethrough|writeback|none|directsync|unsafe][,format=f]\n" " [,serial=s][,addr=A][,id=name][,aio=threads|native]\n" " [,readonly=on|off][,copy-on-read=on|off]\n" + " [,detect-zeroes=on|off|unmap]\n" " [[,bps=b]|[[,bps_rd=r][,bps_wr=w]]]\n" " [[,iops=i]|[[,iops_rd=r][,iops_wr=w]]]\n" " [[,bps_max=bm]|[[,bps_rd_max=rm][,bps_wr_max=wm]]]\n" @@ -470,6 +471,11 @@ Open drive @option{file} as read-only. Guest write attempts will fail. @item copy-on-read=@var{copy-on-read} @var{copy-on-read} is "on" or "off" and enables whether to copy read backing file sectors into the image file. +@item detect-zeroes=@var{detect-zeroes} +@var{detect-zeroes} is "off", "on" or "unmap" and enables the automatic +conversion of plain zero writes by the OS to driver specific optimized +zero write commands. If "unmap" is chosen and @var{discard} is "on" +a zero write may even be converted to an UNMAP operation. @end table By default, the @option{cache=writeback} mode is used. It will report data diff --git a/util/iov.c b/util/iov.c index 6569b5a..0b17392 100644 --- a/util/iov.c +++ b/util/iov.c @@ -335,6 +335,27 @@ void qemu_iovec_concat(QEMUIOVector *dst, qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes); } +/* + * check if the contents of the iovecs is all zero + */ +bool qemu_iovec_is_zero(QEMUIOVector *qiov) +{ + int i; + for (i = 0; i < qiov->niov; i++) { + size_t offs = qiov->iov[i].iov_len & ~(4 * sizeof(long) - 1); + uint8_t *ptr = qiov->iov[i].iov_base; + if (offs && !buffer_is_zero(qiov->iov[i].iov_base, offs)) { + return false; + } + for (; offs < qiov->iov[i].iov_len; offs++) { + if (ptr[offs]) { + return false; + } + } + } + return true; +} + void qemu_iovec_destroy(QEMUIOVector *qiov) { assert(qiov->nalloc != -1); -- 1.7.9.5