Introduce a new driver, that works in pair with copy-before-write to improve fleecing.
Without fleecing driver, old fleecing scheme looks as follows: [guest] | |root v [copy-before-write] -----> [temp.qcow2] <--- [nbd export] | target | |file |backing v | [active disk] <-------------+ With fleecing driver, new scheme is: [guest] | |root v [copy-before-write] -----> [fleecing] <--- [nbd export] | target | | |file | |file v | v [active disk]<--source------+ [temp.img] Benefits of new scheme: 1. Access control: if remote client try to read data that not covered by original dirty bitmap used on copy-before-write open, client gets -EACCES. 2. Discard support: if remote client do DISCARD, this additionally to discarding data in temp.img informs block-copy process to not copy these clusters. Next read from discarded area will return -EACCES. This is significant thing: when fleecing user reads data that was not yet copied to temp.img, we can avoid copying it on further guest write. 3. Synchronisation between client reads and block-copy write is more efficient: it doesn't block intersecting block-copy write during client read. 4. We don't rely on backing feature: active disk should not be backing of temp image, so we avoid some permission-related difficulties and temp image now is not required to support backing, it may be simple raw image. Note that now nobody calls fleecing_drv_activate(), so new driver is actually unusable. It's a work for the following patch: support fleecing block driver in copy-before-write filter driver. Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> --- qapi/block-core.json | 37 +++++- block/fleecing.h | 16 +++ block/fleecing-drv.c | 261 +++++++++++++++++++++++++++++++++++++++++++ MAINTAINERS | 1 + block/meson.build | 1 + 5 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 block/fleecing-drv.c diff --git a/qapi/block-core.json b/qapi/block-core.json index 6904daeacf..b47351dbac 100644 --- a/qapi/block-core.json +++ b/qapi/block-core.json @@ -2917,13 +2917,14 @@ # @blkreplay: Since 4.2 # @compress: Since 5.0 # @copy-before-write: Since 6.2 +# @fleecing: Since 7.0 # # Since: 2.9 ## { 'enum': 'BlockdevDriver', 'data': [ 'blkdebug', 'blklogwrites', 'blkreplay', 'blkverify', 'bochs', 'cloop', 'compress', 'copy-before-write', 'copy-on-read', 'dmg', - 'file', 'ftp', 'ftps', 'gluster', + 'file', 'fleecing', 'ftp', 'ftps', 'gluster', {'name': 'host_cdrom', 'if': 'HAVE_HOST_BLOCK_DEVICE' }, {'name': 'host_device', 'if': 'HAVE_HOST_BLOCK_DEVICE' }, 'http', 'https', 'iscsi', @@ -4181,6 +4182,39 @@ 'base': 'BlockdevOptionsGenericFormat', 'data': { 'target': 'BlockdevRef', '*bitmap': 'BlockDirtyBitmap' } } +## +# @BlockdevOptionsFleecing: +# +# Driver that works in pair with copy-before-write filter to make a fleecing +# scheme like this: +# +# [guest] +# | +# |root +# v +# [copy-before-write] -----> [fleecing] <--- [nbd export] +# | target | | +# |file | |file +# v | v +# [active disk]<--source------+ [temp.img] +# +# The scheme works like this: on write, fleecing driver saves data to its +# ``file`` child and remember that this data is in ``file`` child. On read +# fleecing reads from ``file`` child if data is already stored to it and +# otherwise it reads from ``source`` child. +# In the same time, before each guest write, ``copy-before-write`` copies +# corresponding old data from ``active disk`` to ``fleecing`` node. +# This way, ``fleecing`` node looks like a kind of snapshot for extenal +# reader like NBD export. +# +# @source: node name of source node of fleecing scheme +# +# Since: 7.0 +## +{ 'struct': 'BlockdevOptionsFleecing', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { 'source': 'str' } } + ## # @BlockdevOptions: # @@ -4237,6 +4271,7 @@ 'copy-on-read':'BlockdevOptionsCor', 'dmg': 'BlockdevOptionsGenericFormat', 'file': 'BlockdevOptionsFile', + 'fleecing': 'BlockdevOptionsFleecing', 'ftp': 'BlockdevOptionsCurlFtp', 'ftps': 'BlockdevOptionsCurlFtps', 'gluster': 'BlockdevOptionsGluster', diff --git a/block/fleecing.h b/block/fleecing.h index fb7b2f86c4..75ad2f8b19 100644 --- a/block/fleecing.h +++ b/block/fleecing.h @@ -80,6 +80,9 @@ #include "block/block-copy.h" #include "block/reqlist.h" + +/* fleecing.c */ + typedef struct FleecingState FleecingState; /* @@ -132,4 +135,17 @@ void fleecing_discard(FleecingState *f, int64_t offset, int64_t bytes); void fleecing_mark_done_and_wait_readers(FleecingState *f, int64_t offset, int64_t bytes); + +/* fleecing-drv.c */ + +/* Returns true if @bs->drv is fleecing block driver */ +bool is_fleecing_drv(BlockDriverState *bs); + +/* + * Normally FleecingState is created by copy-before-write filter. Then + * copy-before-write filter calls fleecing_drv_activate() to share FleecingState + * with fleecing block driver. + */ +void fleecing_drv_activate(BlockDriverState *bs, FleecingState *fleecing); + #endif /* FLEECING_H */ diff --git a/block/fleecing-drv.c b/block/fleecing-drv.c new file mode 100644 index 0000000000..202208bb03 --- /dev/null +++ b/block/fleecing-drv.c @@ -0,0 +1,261 @@ +/* + * fleecing block driver + * + * Copyright (c) 2021 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsement...@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" + +#include "sysemu/block-backend.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "block/block_int.h" +#include "block/coroutines.h" +#include "block/qdict.h" +#include "block/block-copy.h" +#include "block/reqlist.h" + +#include "block/copy-before-write.h" +#include "block/fleecing.h" + +typedef struct BDRVFleecingState { + FleecingState *fleecing; + BdrvChild *source; +} BDRVFleecingState; + +static coroutine_fn int fleecing_co_preadv_part( + BlockDriverState *bs, int64_t offset, int64_t bytes, + QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags) +{ + BDRVFleecingState *s = bs->opaque; + const BlockReq *req; + int ret; + + if (!s->fleecing) { + /* fleecing_drv_activate() was not called */ + return -EINVAL; + } + + /* TODO: upgrade to async loop using AioTask */ + while (bytes) { + int64_t cur_bytes; + + ret = fleecing_read_lock(s->fleecing, offset, bytes, &req, &cur_bytes); + if (ret < 0) { + return ret; + } + + if (req) { + ret = bdrv_co_preadv_part(s->source, offset, cur_bytes, + qiov, qiov_offset, flags); + fleecing_read_unlock(s->fleecing, req); + } else { + ret = bdrv_co_preadv_part(bs->file, offset, cur_bytes, + qiov, qiov_offset, flags); + } + if (ret < 0) { + return ret; + } + + bytes -= cur_bytes; + offset += cur_bytes; + qiov_offset += cur_bytes; + } + + return 0; +} + +static int coroutine_fn fleecing_co_block_status(BlockDriverState *bs, + bool want_zero, int64_t offset, + int64_t bytes, int64_t *pnum, + int64_t *map, + BlockDriverState **file) +{ + BDRVFleecingState *s = bs->opaque; + const BlockReq *req = NULL; + int ret; + int64_t cur_bytes; + + if (!s->fleecing) { + /* fleecing_drv_activate() was not called */ + return -EINVAL; + } + + ret = fleecing_read_lock(s->fleecing, offset, bytes, &req, &cur_bytes); + if (ret < 0) { + return ret; + } + + *pnum = cur_bytes; + *map = offset; + + if (req) { + *file = s->source->bs; + fleecing_read_unlock(s->fleecing, req); + } else { + *file = bs->file->bs; + } + + return ret; +} + +static int coroutine_fn fleecing_co_pdiscard(BlockDriverState *bs, + int64_t offset, int64_t bytes) +{ + BDRVFleecingState *s = bs->opaque; + if (!s->fleecing) { + /* fleecing_drv_activate() was not called */ + return -EINVAL; + } + + fleecing_discard(s->fleecing, offset, bytes); + + bdrv_co_pdiscard(bs->file, offset, bytes); + + /* + * Ignore bdrv_co_pdiscard() result: fleecing_discard() succeeded, that + * means that next read from this area will fail with -EACCES. More correct + * to report success now. + */ + return 0; +} + +static int coroutine_fn fleecing_co_pwrite_zeroes(BlockDriverState *bs, + int64_t offset, int64_t bytes, BdrvRequestFlags flags) +{ + BDRVFleecingState *s = bs->opaque; + if (!s->fleecing) { + /* fleecing_drv_activate() was not called */ + return -EINVAL; + } + + /* + * TODO: implement cache, to have a chance to fleecing user to read and + * discard this data before actual writing to temporary image. + */ + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); +} + +static coroutine_fn int fleecing_co_pwritev(BlockDriverState *bs, + int64_t offset, + int64_t bytes, + QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BDRVFleecingState *s = bs->opaque; + if (!s->fleecing) { + /* fleecing_drv_activate() was not called */ + return -EINVAL; + } + + /* + * TODO: implement cache, to have a chance to fleecing user to read and + * discard this data before actual writing to temporary image. + */ + return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags); +} + + +static void fleecing_refresh_filename(BlockDriverState *bs) +{ + pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), + bs->file->bs->filename); +} + +static int fleecing_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVFleecingState *s = bs->opaque; + + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, + BDRV_CHILD_DATA | BDRV_CHILD_PRIMARY, + false, errp); + if (!bs->file) { + return -EINVAL; + } + + s->source = bdrv_open_child(NULL, options, "source", bs, &child_of_bds, + BDRV_CHILD_DATA, false, errp); + if (!s->source) { + return -EINVAL; + } + + bs->total_sectors = bs->file->bs->total_sectors; + + return 0; +} + +static void fleecing_child_perm(BlockDriverState *bs, BdrvChild *c, + BdrvChildRole role, + BlockReopenQueue *reopen_queue, + uint64_t perm, uint64_t shared, + uint64_t *nperm, uint64_t *nshared) +{ + bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared); + + if (role & BDRV_CHILD_PRIMARY) { + *nshared &= BLK_PERM_CONSISTENT_READ; + } else { + *nperm &= BLK_PERM_CONSISTENT_READ; + + /* + * copy-before-write filter is responsible for source child and need + * write access to it. + */ + *nshared |= BLK_PERM_WRITE; + } +} + +BlockDriver bdrv_fleecing_drv = { + .format_name = "fleecing", + .instance_size = sizeof(BDRVFleecingState), + + .bdrv_open = fleecing_open, + + .bdrv_co_preadv_part = fleecing_co_preadv_part, + .bdrv_co_pwritev = fleecing_co_pwritev, + .bdrv_co_pwrite_zeroes = fleecing_co_pwrite_zeroes, + .bdrv_co_pdiscard = fleecing_co_pdiscard, + .bdrv_co_block_status = fleecing_co_block_status, + + .bdrv_refresh_filename = fleecing_refresh_filename, + + .bdrv_child_perm = fleecing_child_perm, +}; + +bool is_fleecing_drv(BlockDriverState *bs) +{ + return bs && bs->drv == &bdrv_fleecing_drv; +} + +void fleecing_drv_activate(BlockDriverState *bs, FleecingState *fleecing) +{ + BDRVFleecingState *s = bs->opaque; + + assert(is_fleecing_drv(bs)); + + s->fleecing = fleecing; +} + +static void fleecing_init(void) +{ + bdrv_register(&bdrv_fleecing_drv); +} + +block_init(fleecing_init); diff --git a/MAINTAINERS b/MAINTAINERS index 78ea04e292..42dc979052 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2425,6 +2425,7 @@ F: block/copy-before-write.h F: block/copy-before-write.c F: block/fleecing.h F: block/fleecing.c +F: block/fleecing-drv.c F: include/block/aio_task.h F: block/aio_task.c F: util/qemu-co-shared-resource.c diff --git a/block/meson.build b/block/meson.build index d30da90a01..b493580fbe 100644 --- a/block/meson.build +++ b/block/meson.build @@ -19,6 +19,7 @@ block_ss.add(files( 'dirty-bitmap.c', 'filter-compress.c', 'fleecing.c', + 'fleecing-drv.c', 'io.c', 'mirror.c', 'nbd.c', -- 2.31.1