FleecingState represents state shared between copy-before-write filter and upcoming fleecing block driver.
Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> --- block/fleecing.h | 135 ++++++++++++++++++++++++++++++++++ block/fleecing.c | 182 ++++++++++++++++++++++++++++++++++++++++++++++ MAINTAINERS | 2 + block/meson.build | 1 + 4 files changed, 320 insertions(+) create mode 100644 block/fleecing.h create mode 100644 block/fleecing.c diff --git a/block/fleecing.h b/block/fleecing.h new file mode 100644 index 0000000000..fb7b2f86c4 --- /dev/null +++ b/block/fleecing.h @@ -0,0 +1,135 @@ +/* + * FleecingState + * + * The common state of image fleecing, shared between copy-before-write filter + * and fleecing block driver. + * + * Copyright (c) 2021 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsement...@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * + * Fleecing scheme looks as follows: + * + * [guest blk] [nbd export] + * | | + * |root | + * v v + * [copy-before-write]--target-->[fleecing drv] + * | / | + * |file / |file + * v / v + * [active disk]<--source-----/ [temp disk] + * + * Note that "active disk" is also called just "source" and "temp disk" is also + * called "target". + * + * What happens here: + * + * copy-before-write filter performs copy-before-write operations: on guest + * write we should copy old data to target child before rewriting. Note that we + * write this data through fleecing driver: it saves a possibility to implement + * a kind of cache in fleecing driver in future. + * + * Fleecing user is nbd export: it can read from fleecing node, which guarantees + * a snapshot-view for fleecing user. Fleecing user may also do discard + * operations. + * + * FleecingState is responsible for most of the fleecing logic: + * + * 1. Fleecing read. Handle reads of fleecing user: we should decide where from + * to read, from source node or from copy-before-write target node. In former + * case we need to synchronize with guest writes. See fleecing_read_lock() and + * fleecing_read_unlock() functionality. + * + * 2. Guest write synchronization (part of [1] actually). See + * fleecing_mark_done_and_wait_readers() + * + * 3. Fleecing discard. Used by fleecing user when corresponding area is already + * copied. Fleecing user may discard the area which is not needed anymore, that + * should result in: + * - discarding data to free disk space + * - clear bits in copy-bitmap of block-copy, to avoid extra copy-before-write + * operations + * - clear bits in access-bitmap of FleecingState, to avoid further wrong + * access + * + * Still, FleecingState doesn't own any block children, so all real io + * operations (reads, writes and discards) are done by copy-before-write filter + * and fleecing block driver. + */ + +#ifndef FLEECING_H +#define FLEECING_H + +#include "block/block_int.h" +#include "block/block-copy.h" +#include "block/reqlist.h" + +typedef struct FleecingState FleecingState; + +/* + * Create FleecingState. + * + * @bcs: link to block-copy owned by copy-before-write filter. + * + * @fleecing_node: should be fleecing block driver node. Used to create some + * bitmaps in it. + */ +FleecingState *fleecing_new(BlockCopyState *bcs, + BlockDriverState *fleecing_node, + Error **errp); + +/* Free the state. Doesn't free block-copy state (@bcs) */ +void fleecing_free(FleecingState *s); + +/* + * Convenient function for thous who want to do fleecing read. + * + * If requested region starts in "done" area, i.e. data is already copied to + * copy-before-write target node, req is set to NULL, pnum is set to available + * bytes to read from target. User is free to read @pnum bytes from target. + * Still, user is responsible for concurrent discards on target. + * + * If requests region starts in "not done" area, i.e. we have to read from + * source node directly, than @pnum bytes of source node are frozen and + * guaranteed not be rewritten until user calls cbw_snapshot_read_unlock(). + * + * Returns 0 on success and -EACCES when try to read non-dirty area of + * access_bitmap. + */ +int fleecing_read_lock(FleecingState *f, int64_t offset, + int64_t bytes, const BlockReq **req, int64_t *pnum); +/* Called as closing pair for fleecing_read_lock() */ +void fleecing_read_unlock(FleecingState *f, const BlockReq *req); + +/* + * Called when fleecing user doesn't need the region anymore (for example the + * region is successfully read and backed up somewhere). + * This prevents extra copy-before-write operations in this area in future. + * Next fleecing read from this area will fail with -EACCES. + */ +void fleecing_discard(FleecingState *f, int64_t offset, int64_t bytes); + +/* + * Called by copy-before-write filter after successful copy-before-write + * operation to synchronize with parallel fleecing reads. + */ +void fleecing_mark_done_and_wait_readers(FleecingState *f, int64_t offset, + int64_t bytes); + +#endif /* FLEECING_H */ diff --git a/block/fleecing.c b/block/fleecing.c new file mode 100644 index 0000000000..f75d11b892 --- /dev/null +++ b/block/fleecing.c @@ -0,0 +1,182 @@ +/* + * FleecingState + * + * The common state of image fleecing, shared between copy-before-write filter + * and fleecing block driver. + * + * Copyright (c) 2021 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsement...@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" + +#include "sysemu/block-backend.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "block/block_int.h" +#include "block/coroutines.h" +#include "block/qdict.h" +#include "block/block-copy.h" +#include "block/reqlist.h" + +#include "block/fleecing.h" + +/* + * @bcs: link to block-copy state owned by copy-before-write filter which + * performs copy-before-write operations in context of fleecing scheme. + * FleecingState doesn't own the block-copy state and don't free it on cleanup. + * + * @lock: protects access to @access_bitmap, @done_bitmap and @frozen_read_reqs + * + * @access_bitmap: represents areas allowed for reading by fleecing user. + * Reading from non-dirty areas leads to -EACCES. Discard operation among other + * things clears corresponding bits in this bitmaps. + * + * @done_bitmap: represents areas that was successfully copied by + * copy-before-write operations. So, for dirty areas fleecing user should read + * from target node and for clear areas - from source node. + * + * @frozen_read_reqs: current read requests for fleecing user in source node. + * corresponding areas must not be rewritten by guest. + */ +typedef struct FleecingState { + BlockCopyState *bcs; + + CoMutex lock; + + BdrvDirtyBitmap *access_bitmap; + BdrvDirtyBitmap *done_bitmap; + + BlockReqList frozen_read_reqs; +} FleecingState; + +FleecingState *fleecing_new(BlockCopyState *bcs, + BlockDriverState *fleecing_node, + Error **errp) +{ + BdrvDirtyBitmap *bcs_bitmap = block_copy_dirty_bitmap(bcs), + *done_bitmap, *access_bitmap; + int64_t cluster_size = block_copy_cluster_size(bcs); + FleecingState *s; + + /* done_bitmap starts empty */ + done_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL, + errp); + if (!done_bitmap) { + return NULL; + } + bdrv_disable_dirty_bitmap(done_bitmap); + + /* access_bitmap starts equal to bcs_bitmap */ + access_bitmap = bdrv_create_dirty_bitmap(fleecing_node, cluster_size, NULL, + errp); + if (!access_bitmap) { + return NULL; + } + bdrv_disable_dirty_bitmap(access_bitmap); + if (!bdrv_dirty_bitmap_merge_internal(access_bitmap, bcs_bitmap, + NULL, true)) + { + return NULL; + } + + s = g_new(FleecingState, 1); + *s = (FleecingState) { + .bcs = bcs, + .done_bitmap = done_bitmap, + .access_bitmap = access_bitmap, + }; + qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->frozen_read_reqs); + + return s; +} + +void fleecing_free(FleecingState *s) +{ + if (!s) { + return; + } + + bdrv_release_dirty_bitmap(s->access_bitmap); + bdrv_release_dirty_bitmap(s->done_bitmap); + g_free(s); +} + +static BlockReq *add_read_req(FleecingState *s, uint64_t offset, uint64_t bytes) +{ + BlockReq *req = g_new(BlockReq, 1); + + reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes); + + return req; +} + +static void drop_read_req(BlockReq *req) +{ + reqlist_remove_req(req); + g_free(req); +} + +int fleecing_read_lock(FleecingState *s, int64_t offset, + int64_t bytes, const BlockReq **req, + int64_t *pnum) +{ + bool done; + + QEMU_LOCK_GUARD(&s->lock); + + if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) { + return -EACCES; + } + + bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, &done, pnum); + if (!done) { + *req = add_read_req(s, offset, *pnum); + } + + return 0; +} + +void fleecing_read_unlock(FleecingState *s, const BlockReq *req) +{ + QEMU_LOCK_GUARD(&s->lock); + + drop_read_req((BlockReq *)req); +} + +void fleecing_discard(FleecingState *s, int64_t offset, int64_t bytes) +{ + WITH_QEMU_LOCK_GUARD(&s->lock) { + bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes); + } + + block_copy_reset(s->bcs, offset, bytes); +} + +void fleecing_mark_done_and_wait_readers(FleecingState *s, int64_t offset, + int64_t bytes) +{ + assert(QEMU_IS_ALIGNED(offset, block_copy_cluster_size(s->bcs))); + assert(QEMU_IS_ALIGNED(bytes, block_copy_cluster_size(s->bcs))); + + WITH_QEMU_LOCK_GUARD(&s->lock) { + bdrv_set_dirty_bitmap(s->done_bitmap, offset, bytes); + reqlist_wait_all(&s->frozen_read_reqs, offset, bytes, &s->lock); + } +} diff --git a/MAINTAINERS b/MAINTAINERS index 7f24ee4b92..78ea04e292 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2423,6 +2423,8 @@ F: block/reqlist.c F: include/block/reqlist.h F: block/copy-before-write.h F: block/copy-before-write.c +F: block/fleecing.h +F: block/fleecing.c F: include/block/aio_task.h F: block/aio_task.c F: util/qemu-co-shared-resource.c diff --git a/block/meson.build b/block/meson.build index 5065cf33ba..d30da90a01 100644 --- a/block/meson.build +++ b/block/meson.build @@ -18,6 +18,7 @@ block_ss.add(files( 'crypto.c', 'dirty-bitmap.c', 'filter-compress.c', + 'fleecing.c', 'io.c', 'mirror.c', 'nbd.c', -- 2.31.1