With proper configuration, this allows the VM to continue running even if some of its devices are broken.
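
As an illustration of such a configuration (a sketch only: the child file names, formats and the if= option below are placeholders), a two-child FIFO setup could be started along these lines, using the read-pattern and vote-threshold options parsed in quorum_open():

  -drive driver=quorum,read-pattern=fifo,vote-threshold=1,if=virtio,children.0.driver=raw,children.0.file.filename=disk0.img,children.1.driver=raw,children.1.file.filename=disk1.img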
We mark a device as broken when the protocol tier notifies us of a broken state of the device, such as a disconnection reported via driver operations. We can also reset the device to sound once the protocol tier is repaired.

Originally, .threshold controls how we decide whether a read/write succeeded: we return failure only if the success count of the read/write is less than the .threshold specified by the user. But it doesn't record the state of the underlying children and can hurt performance in some cases. For example, suppose we have 3 children and .threshold is set to 2. If one of the devices is broken, we should still return success and continue to run the VM, but for every I/O operation we blindly send requests to the broken device.

By storing the broken state in the driver state, we can avoid sending requests to broken devices and resend the requests to them once they are repaired (by setting broken back to false). This is especially useful for network-based protocols such as sheepdog, which has an auto-reconnection mechanism and never reports EIO when the connection is broken; it just stores the requests in its local queue and waits to resend them. Without the broken state, a quorum request will not come back until the connection is re-established. So we have to skip broken devices to allow the VM to continue running with network-backed children (glusterfs, nfs, sheepdog, etc.).

With the combination of read-pattern and threshold, we can easily mimic the DRBD behavior with the following configuration: read-pattern=fifo,threshold=1 with two children.

Cc: Eric Blake <ebl...@redhat.com>
Cc: Benoit Canet <ben...@irqsave.net>
Cc: Kevin Wolf <kw...@redhat.com>
Cc: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: Liu Yuan <namei.u...@gmail.com>
---
 block/quorum.c            | 102 ++++++++++++++++++++++++++++++++++++++--------
 include/block/block_int.h |   3 ++
 2 files changed, 87 insertions(+), 18 deletions(-)

diff --git a/block/quorum.c b/block/quorum.c
index b9eeda3..7b07e35 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -120,6 +120,7 @@ struct QuorumAIOCB {
 
     int rewrite_count;          /* number of replica to rewrite: count down to
                                  * zero once writes are fired */
+    int issued_count;           /* actual read&write issued count */
 
     QuorumVotes votes;
 
@@ -170,8 +171,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb)
     if (acb->is_read) {
         /* on the quorum case acb->child_iter == s->num_children - 1 */
         for (i = 0; i <= acb->child_iter; i++) {
-            qemu_vfree(acb->qcrs[i].buf);
-            qemu_iovec_destroy(&acb->qcrs[i].qiov);
+            if (acb->qcrs[i].buf) {
+                qemu_vfree(acb->qcrs[i].buf);
+                qemu_iovec_destroy(&acb->qcrs[i].qiov);
+            }
         }
     }
 
@@ -207,6 +210,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s,
     acb->count = 0;
     acb->success_count = 0;
     acb->rewrite_count = 0;
+    acb->issued_count = 0;
     acb->votes.compare = quorum_sha256_compare;
     QLIST_INIT(&acb->votes.vote_list);
     acb->is_read = false;
@@ -286,6 +290,22 @@ static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source)
     }
 }
 
+static int next_fifo_child(QuorumAIOCB *acb)
+{
+    BDRVQuorumState *s = acb->common.bs->opaque;
+    int i;
+
+    for (i = acb->child_iter; i < s->num_children; i++) {
+        if (!s->bs[i]->broken) {
+            break;
+        }
+    }
+    if (i == s->num_children) {
+        return -1;
+    }
+    return i;
+}
+
 static void quorum_aio_cb(void *opaque, int ret)
 {
     QuorumChildRequest *sacb = opaque;
@@ -293,11 +313,18 @@ static void quorum_aio_cb(void *opaque, int ret)
     BDRVQuorumState *s = acb->common.bs->opaque;
     bool rewrite = false;
 
+    if (ret < 0) {
+        s->bs[acb->child_iter]->broken = true;
+    }
+
     if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) {
         /* We try to read next child in FIFO order if we fail to read */
-        if (ret < 0 && ++acb->child_iter < s->num_children) {
-            read_fifo_child(acb);
-            return;
+        if (ret < 0) {
+            acb->child_iter = next_fifo_child(acb);
+            if (acb->child_iter > 0) {
+                read_fifo_child(acb);
+                return;
+            }
         }
 
         if (ret == 0) {
@@ -315,9 +342,9 @@
     } else {
         quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret);
     }
-    assert(acb->count <= s->num_children);
-    assert(acb->success_count <= s->num_children);
-    if (acb->count < s->num_children) {
+    assert(acb->count <= acb->issued_count);
+    assert(acb->success_count <= acb->issued_count);
+    if (acb->count < acb->issued_count) {
         return;
     }
 
@@ -647,22 +674,46 @@ free_exit:
     return rewrite;
 }
 
+static bool has_enough_children(BDRVQuorumState *s)
+{
+    int good = 0, i;
+
+    for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
+        good++;
+    }
+
+    if (good >= s->threshold) {
+        return true;
+    } else {
+        return false;
+    }
+}
+
 static BlockDriverAIOCB *read_quorum_children(QuorumAIOCB *acb)
 {
     BDRVQuorumState *s = acb->common.bs->opaque;
     int i;
 
+    if (!has_enough_children(s)) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
+
     for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
         acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size);
         qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov);
         qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf);
-    }
-
-    for (i = 0; i < s->num_children; i++) {
         acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
                                             &acb->qcrs[i].qiov, acb->nb_sectors,
                                             quorum_aio_cb, &acb->qcrs[i]);
+        acb->issued_count++;
     }
 
     return &acb->common;
@@ -679,7 +730,7 @@ static BlockDriverAIOCB *read_fifo_child(QuorumAIOCB *acb)
     acb->qcrs[i].aiocb = bdrv_aio_readv(s->bs[i], acb->sector_num,
                                         &acb->qcrs[i].qiov, acb->nb_sectors,
                                         quorum_aio_cb, &acb->qcrs[i]);
-
+    acb->issued_count = 1;
     return &acb->common;
 }
 
@@ -693,6 +744,7 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
     BDRVQuorumState *s = bs->opaque;
     QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num,
                                       nb_sectors, cb, opaque);
+
     acb->is_read = true;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
@@ -701,6 +753,11 @@ static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs,
     }
 
     acb->child_iter = 0;
+    acb->child_iter = next_fifo_child(acb);
+    if (acb->child_iter < 0) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
     return read_fifo_child(acb);
 }
 
@@ -716,10 +773,19 @@ static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs,
                                       cb, opaque);
     int i;
 
+    if (!has_enough_children(s)) {
+        quorum_aio_release(acb);
+        return NULL;
+    }
+
     for (i = 0; i < s->num_children; i++) {
+        if (s->bs[i]->broken) {
+            continue;
+        }
         acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov,
                                              nb_sectors, &quorum_aio_cb,
                                              &acb->qcrs[i]);
+        acb->issued_count++;
     }
 
     return &acb->common;
@@ -926,6 +992,12 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     }
 
     s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0);
+    /* and validate it against s->num_children */
+    ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
+    if (ret < 0) {
+        goto exit;
+    }
+
     ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN));
     if (ret < 0) {
         error_setg(&local_err, "Please set read-pattern as fifo or quorum\n");
@@ -934,12 +1006,6 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags,
     s->read_pattern = ret;
 
     if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) {
-        /* and validate it against s->num_children */
-        ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err);
-        if (ret < 0) {
-            goto exit;
-        }
-
         /* is the driver in blkverify mode */
         if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) &&
             s->num_children == 2 && s->threshold == 2) {
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 9fdec7f..599110b 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -402,6 +402,9 @@ struct BlockDriverState {
 
     /* The error object in use for blocking operations on backing_hd */
     Error *backing_blocker;
+
+    /* True if the associated device is broken */
+    bool broken;
 };
 
 int get_tmp_filename(char *filename, int size);
-- 
1.9.1