The Tuesday 11 Mar 2014 à 20:58:06 (+0100), Max Reitz wrote : > On 11.03.2014 17:36, Benoît Canet wrote: > >On read operations when this parameter is set and some replicas are corrupted > >while quorum can be reached quorum will proceed to rewrite the correct > >version > >of the data to fix the corrupted replicas. > > > >This will shine with SSD where the FTL will remap the same block at another > >place on rewrite. > > > >Signed-off-by: Benoit Canet <ben...@irqsave.net> > >--- > > block/quorum.c | 97 > > +++++++++++++++++++++++++++++++++++++++++++--- > > qapi-schema.json | 5 ++- > > tests/qemu-iotests/081 | 14 +++++++ > > tests/qemu-iotests/081.out | 11 +++++- > > 4 files changed, 119 insertions(+), 8 deletions(-) > > > >diff --git a/block/quorum.c b/block/quorum.c > >index bd997b7..af4fd3c 100644 > >--- a/block/quorum.c > >+++ b/block/quorum.c > >@@ -22,6 +22,7 @@ > > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > > #define QUORUM_OPT_BLKVERIFY "blkverify" > >+#define QUORUM_OPT_REWRITE "rewrite-corrupted" > > /* This union holds a vote hash value */ > > typedef union QuorumVoteValue { > >@@ -69,6 +70,9 @@ typedef struct BDRVQuorumState { > > * It is useful to debug other block drivers by > > * comparing them with a reference one. > > */ > >+ bool rewrite_corrupted;/* true if the driver must rewrite-on-read > >corrupted > >+ * block if Quorum is reached. > >+ */ > > } BDRVQuorumState; > > typedef struct QuorumAIOCB QuorumAIOCB; > >@@ -104,13 +108,17 @@ struct QuorumAIOCB { > > int count; /* number of completed AIOCB */ > > int success_count; /* number of successfully completed AIOCB > > */ > >+ int rewrite_count; /* number of replica to rewrite: count down > >to > >+ * zero once writes are fired > >+ */ > >+ > > QuorumVotes votes; > > bool is_read; > > int vote_ret; > > }; > >-static void quorum_vote(QuorumAIOCB *acb); > >+static bool quorum_vote(QuorumAIOCB *acb); > > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > > { > >@@ -182,6 +190,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > > acb->qcrs = g_new0(QuorumChildRequest, s->num_children); > > acb->count = 0; > > acb->success_count = 0; > >+ acb->rewrite_count = 0; > > acb->votes.compare = quorum_sha256_compare; > > QLIST_INIT(&acb->votes.vote_list); > > acb->is_read = false; > >@@ -241,11 +250,27 @@ static bool quorum_has_too_much_io_failed(QuorumAIOCB > >*acb) > > return false; > > } > >+static void quorum_rewrite_aio_cb(void *opaque, int ret) > >+{ > >+ QuorumAIOCB *acb = opaque; > >+ > >+ /* one less rewrite to do */ > >+ acb->rewrite_count--; > >+ > >+ /* wait until all rewrite callback have completed */ > > *callbacks > > >+ if (acb->rewrite_count) { > >+ return; > >+ } > >+ > >+ quorum_aio_finalize(acb); > >+} > >+ > > static void quorum_aio_cb(void *opaque, int ret) > > { > > QuorumChildRequest *sacb = opaque; > > QuorumAIOCB *acb = sacb->parent; > > BDRVQuorumState *s = acb->common.bs->opaque; > >+ bool rewrite = false; > > sacb->ret = ret; > > acb->count++; > >@@ -262,12 +287,15 @@ static void quorum_aio_cb(void *opaque, int ret) > > /* Do the vote on read */ > > if (acb->is_read) { > >- quorum_vote(acb); > >+ rewrite = quorum_vote(acb); > > } else { > > quorum_has_too_much_io_failed(acb); > > } > >- quorum_aio_finalize(acb); > >+ /* if no rewrite is done the code will finish right away */ > >+ if (!rewrite) { > >+ quorum_aio_finalize(acb); > >+ } > > } > > static void quorum_report_bad_versions(BDRVQuorumState *s, > >@@ -287,6 +315,43 @@ static void quorum_report_bad_versions(BDRVQuorumState > >*s, > > } > > } > >+static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB > >*acb, > >+ QuorumVoteValue *value) > >+{ > >+ QuorumVoteVersion *version; > >+ QuorumVoteItem *item; > >+ int count = 0; > >+ > >+ /* first count the number of bad versions: done first to avoid > >concurency > > *concurrency > > >+ * issues. > >+ */ > >+ QLIST_FOREACH(version, &acb->votes.vote_list, next) { > >+ if (acb->votes.compare(&version->value, value)) { > >+ continue; > >+ } > >+ QLIST_FOREACH(item, &version->items, next) { > >+ count++; > >+ } > >+ } > >+ > >+ /* quorum_rewrite_aio_cb will count down this to zero */ > >+ acb->rewrite_count = count; > >+ > >+ /* now fire the correcting rewrites */ > >+ QLIST_FOREACH(version, &acb->votes.vote_list, next) { > >+ if (acb->votes.compare(&version->value, value)) { > >+ continue; > >+ } > >+ QLIST_FOREACH(item, &version->items, next) { > >+ bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, > >+ acb->nb_sectors, quorum_rewrite_aio_cb, acb); > >+ } > >+ } > >+ > >+ /* return true if any rewrite is done else false */ > >+ return count; > >+} > >+ > > static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > > { > > int i; > >@@ -477,16 +542,17 @@ static int quorum_vote_error(QuorumAIOCB *acb) > > return ret; > > } > >-static void quorum_vote(QuorumAIOCB *acb) > >+static bool quorum_vote(QuorumAIOCB *acb) > > { > > bool quorum = true; > >+ bool rewrite = false; > > int i, j, ret; > > QuorumVoteValue hash; > > BDRVQuorumState *s = acb->common.bs->opaque; > > QuorumVoteVersion *winner; > > if (quorum_has_too_much_io_failed(acb)) { > >- return; > >+ return false;; > > Two semicolons here. > > > } > > /* get the index of the first successful read */ > >@@ -514,7 +580,7 @@ static void quorum_vote(QuorumAIOCB *acb) > > /* Every successful read agrees */ > > if (quorum) { > > quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); > >- return; > >+ return false;; > > And here. > > > } > > /* compute hashes for each successful read, also store indexes */ > >@@ -547,9 +613,15 @@ static void quorum_vote(QuorumAIOCB *acb) > > /* some versions are bad print them */ > > quorum_report_bad_versions(s, acb, &winner->value); > >+ /* corruption correction is actived */ > > I'd vote for "enabled" rather than "actived". But I like "corruption > correction". :-) > > >+ if (s->rewrite_corrupted) { > >+ rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); > >+ } > >+ > > free_exit: > > /* free lists */ > > quorum_free_vote_list(&acb->votes); > >+ return rewrite; > > } > > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > >@@ -709,6 +781,11 @@ static QemuOptsList quorum_runtime_opts = { > > .type = QEMU_OPT_BOOL, > > .help = "Trigger block verify mode if set", > > }, > >+ { > >+ .name = QUORUM_OPT_REWRITE, > >+ .type = QEMU_OPT_BOOL, > >+ .help = "Rewrite corrupted block on read quorum", > >+ }, > > { /* end of list */ } > > }, > > }; > >@@ -770,6 +847,14 @@ static int quorum_open(BlockDriverState *bs, QDict > >*options, int flags, > > "and using two files with vote_threshold=2\n"); > > } > >+ s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, > >false); > >+ if (s->rewrite_corrupted && s->is_blkverify) { > >+ error_setg(&local_err, > >+ "rewrite-corrupted=on cannot be used with blkverify=on"); > >+ ret = -EINVAL; > >+ goto exit; > >+ } > >+ > > /* allocate the children BlockDriverState array */ > > s->bs = g_new0(BlockDriverState *, s->num_children); > > opened = g_new0(bool, s->num_children); > >diff --git a/qapi-schema.json b/qapi-schema.json > >index 6476d4a..f5a8767 100644 > >--- a/qapi-schema.json > >+++ b/qapi-schema.json > >@@ -4542,12 +4542,15 @@ > > # > > # @vote-threshold: the vote limit under which a read will fail > > # > >+# @rewrite-corrupted: #optional rewrite corrupted data when quorum is > >reached > >+# (Since 2.1) > >+# > > # Since: 2.0 > > ## > > { 'type': 'BlockdevOptionsQuorum', > > 'data': { '*blkverify': 'bool', > > 'children': [ 'BlockdevRef' ], > >- 'vote-threshold': 'int' } } > >+ 'vote-threshold': 'int', '*rewrite-corrupted': 'bool' } } > > ## > > # @BlockdevOptions > >diff --git a/tests/qemu-iotests/081 b/tests/qemu-iotests/081 > >index b512d00..bb211d6 100755 > >--- a/tests/qemu-iotests/081 > >+++ b/tests/qemu-iotests/081 > >@@ -95,6 +95,18 @@ echo "== checking quorum correction ==" > > $QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io > > echo > >+echo "== using quorum rewrite corrupted mode ==" > >+ > >+quorum="$quorum,file.rewrite-corrupted=on" > >+ > >+$QEMU_IO -c "open -o $quorum" -c "read -P 0x32 0 $size" | _filter_qemu_io > >+ > >+echo > >+echo "== checking that quorum has corrected the corrupted file ==" > >+ > >+$QEMU_IO -c "read -P 0x32 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io > >+ > > I'd prefer it if you could put these new test cases below the > "checking mixed reference/option specification" test, just because > I'd like that test to output the quorum warning - especially there > is no other point in the test where we can see that QMP message. > > But that's up to you and if you at least fix the double semicolons:
Ok I will do these changes. Thanks for the review Best regards Benoît > > Reviewed-by: Max Reitz <mre...@redhat.com> > > >+echo > > echo "== checking mixed reference/option specification ==" > > run_qemu -drive "file=$TEST_DIR/2.raw,format=$IMGFMT,if=none,id=drive2" > > <<EOF > >@@ -137,6 +149,8 @@ echo > > echo "== breaking quorum ==" > > $QEMU_IO -c "write -P 0x41 0 $size" "$TEST_DIR/1.raw" | _filter_qemu_io > >+$QEMU_IO -c "write -P 0x42 0 $size" "$TEST_DIR/2.raw" | _filter_qemu_io > >+ > > echo > > echo "== checking that quorum is broken ==" > >diff --git a/tests/qemu-iotests/081.out b/tests/qemu-iotests/081.out > >index 84aeb0c..2d8b290 100644 > >--- a/tests/qemu-iotests/081.out > >+++ b/tests/qemu-iotests/081.out > >@@ -25,12 +25,19 @@ wrote 10485760/10485760 bytes at offset 0 > > read 10485760/10485760 bytes at offset 0 > > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > >+== using quorum rewrite corrupted mode == > >+read 10485760/10485760 bytes at offset 0 > >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > >+ > >+== checking that quorum has corrected the corrupted file == > >+read 10485760/10485760 bytes at offset 0 > >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > >+ > > == checking mixed reference/option specification == > > Testing: -drive file=TEST_DIR/2.IMGFMT,format=IMGFMT,if=none,id=drive2 > > QMP_VERSION > > {"return": {}} > > {"return": {}} > >-{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": > >"QUORUM_REPORT_BAD", "data": {"node-name": "", "sectors-count": 20480, > >"sector-num": 0}} > > read 10485760/10485760 bytes at offset 0 > > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > > {"return": ""} > >@@ -43,6 +50,8 @@ read 10485760/10485760 bytes at offset 0 > > == breaking quorum == > > wrote 10485760/10485760 bytes at offset 0 > > 10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > >+wrote 10485760/10485760 bytes at offset 0 > >+10 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > > == checking that quorum is broken == > > qemu-io: can't open device (null): Could not read image for determining > > its format: Input/output error > >