Le Tuesday 04 Feb 2014 à 16:40:12 (+0100), Kevin Wolf a écrit : > Am 03.02.2014 um 22:51 hat Benoît Canet geschrieben: > > From: Benoît Canet <ben...@irqsave.net> > > > > Use gnutls's SHA-256 to compare versions. > > > > Signed-off-by: Benoit Canet <ben...@irqsave.net> > > --- > > block/Makefile.objs | 2 +- > > block/quorum.c | 386 > > +++++++++++++++++++++++++++++++++++++++++++++- > > configure | 36 +++++ > > docs/qmp/qmp-events.txt | 33 ++++ > > include/monitor/monitor.h | 2 + > > monitor.c | 2 + > > 6 files changed, 458 insertions(+), 3 deletions(-) > > > > diff --git a/block/Makefile.objs b/block/Makefile.objs > > index a2650b9..4ca9d43 100644 > > --- a/block/Makefile.objs > > +++ b/block/Makefile.objs > > @@ -3,7 +3,7 @@ block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o > > qcow2-snapshot.o qcow2-c > > block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o > > block-obj-y += qed-check.o > > block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o > > -block-obj-y += quorum.o > > +block-obj-$(CONFIG_QUORUM) += quorum.o > > block-obj-y += parallels.o blkdebug.o blkverify.o > > block-obj-y += snapshot.o qapi.o > > block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o > > diff --git a/block/quorum.c b/block/quorum.c > > index 699b512..837d261 100644 > > --- a/block/quorum.c > > +++ b/block/quorum.c > > @@ -13,7 +13,43 @@ > > * See the COPYING file in the top-level directory. > > */ > > > > +#include <gnutls/gnutls.h> > > +#include <gnutls/crypto.h> > > #include "block/block_int.h" > > +#include "qapi/qmp/qjson.h" > > + > > +#define HASH_LENGTH 32 > > + > > +/* This union holds a vote hash value */ > > +typedef union QuorumVoteValue { > > + char h[HASH_LENGTH]; /* SHA-256 hash */ > > + int64_t l; /* simpler 64 bits hash */ > > +} QuorumVoteValue; > > + > > +/* A vote item */ > > +typedef struct QuorumVoteItem { > > + int index; > > + QLIST_ENTRY(QuorumVoteItem) next; > > +} QuorumVoteItem; > > + > > +/* this structure is a vote version. 
A version is the set of votes sharing > > the > > + * same vote value. > > + * The set of votes will be tracked with the items field and its > > cardinality is > > + * vote_count. > > + */ > > +typedef struct QuorumVoteVersion { > > + QuorumVoteValue value; > > + int index; > > + int vote_count; > > + QLIST_HEAD(, QuorumVoteItem) items; > > + QLIST_ENTRY(QuorumVoteVersion) next; > > +} QuorumVoteVersion; > > + > > +/* this structure holds a group of vote versions together */ > > +typedef struct QuorumVotes { > > + QLIST_HEAD(, QuorumVoteVersion) vote_list; > > + int (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); > > +} QuorumVotes; > > > > /* the following structure holds the state of one quorum instance */ > > typedef struct { > > @@ -60,10 +96,14 @@ struct QuorumAIOCB { > > int success_count; /* number of successfully completed AIOCB > > */ > > bool *finished; /* completion signal for cancel */ > > > > + QuorumVotes votes; > > + > > bool is_read; > > int vote_ret; > > }; > > > > +static void quorum_vote(QuorumAIOCB *acb); > > + > > static void quorum_aio_cancel(BlockDriverAIOCB *blockacb) > > { > > QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); > > @@ -81,10 +121,12 @@ static AIOCBInfo quorum_aiocb_info = { > > .cancel = quorum_aio_cancel, > > }; > > > > +static int quorum_vote_error(QuorumAIOCB *acb); > > + > > What's the reason for putting the forward declaration here? This is > neither directly before the first user nor at the top. > > In fact, the next occurrence of quorum_vote_error() is the implementation > of the function, so the forward declaration is completely unnecessary.
> > > static void quorum_aio_finalize(QuorumAIOCB *acb) > > { > > BDRVQuorumState *s = acb->bqs; > > - int ret = 0; > > + int i, ret = 0; > > > > for (i = 0; i < s->total; i++) { > > qemu_vfree(acb->aios[i].buf); > > @@ -92,6 +134,10 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > > acb->aios[i].ret = 0; > > } > > > > + if (acb->vote_ret) { > > + ret = acb->vote_ret; > > + } > > + > > acb->common.cb(acb->common.opaque, ret); > > if (acb->finished) { > > *acb->finished = true; > > @@ -103,6 +149,27 @@ static void quorum_aio_finalize(QuorumAIOCB *acb) > > qemu_aio_release(acb); > > } > > > > +static int quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) > > +{ > > + return memcmp(a->h, b->h, HASH_LENGTH); > > +} > > + > > +static int quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) > > +{ > > + int64_t i = a->l; > > + int64_t j = b->l; > > + > > + if (i < j) { > > + return -1; > > + } > > + > > + if (i > j) { > > + return 1; > > + } > > + > > + return 0; > > +} > > The usual way to implement this is 'return a->l - b->l;', because if you > expect memcmp() to return a valid value for the compare function you > can't assume that it's normalised to -1/0/1 anyway. > > As you only ever use the result as a bool, you could alternatively > even declare the function as such and do 'return a->l != b->l;'. > > > static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > > BlockDriverState *bs, > > QEMUIOVector *qiov, > > @@ -122,6 +189,7 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, > > acb->count = 0; > > acb->success_count = 0; > > acb->finished = NULL; > > + acb->votes.compare = quorum_sha256_compare; > > acb->is_read = false; > > acb->vote_ret = 0; > > > > You need to initialise votes.vote_list as well. 
> > > @@ -151,9 +219,323 @@ static void quorum_aio_cb(void *opaque, int ret) > > return; > > } > > > > + /* Do the vote on read */ > > + if (acb->is_read) { > > + quorum_vote(acb); > > + } > > + > > quorum_aio_finalize(acb); > > } > > > > +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name) > > +{ > > + QObject *data; > > + data = qobject_from_jsonf("{ 'node-name': \"%s\"" > > + ", 'sector-num': %" PRId64 > > + ", 'sectors-count': %i }", > > + node_name, > > Can't node_name be NULL here?
No, node_name is a member of BlockDriverState, so the only thing that could happen is bs->node_name[0] == '\0'. Still, I will add a safety check. > > > + acb->sector_num, > > + acb->nb_sectors); > > + monitor_protocol_event(QEVENT_QUORUM_REPORT_BAD, data); > > + qobject_decref(data); > > +} > > + > > +static void quorum_report_failure(QuorumAIOCB *acb) > > +{ > > + QObject *data; > > + data = qobject_from_jsonf("{ 'sector-num': %" PRId64 > > + ", 'sectors-count': %i }", > > + acb->sector_num, > > + acb->nb_sectors); > > If I have multiple quorum devices, this event doesn't tell me, which one > it is about. > > > + monitor_protocol_event(QEVENT_QUORUM_FAILURE, data); > > + qobject_decref(data); > > +} > > + > > +static void quorum_report_bad_versions(BDRVQuorumState *s, > > + QuorumAIOCB *acb, > > + QuorumVoteValue *value) > > +{ > > + QuorumVoteVersion *version; > > + QuorumVoteItem *item; > > + > > + QLIST_FOREACH(version, &acb->votes.vote_list, next) { > > + if (!acb->votes.compare(&version->value, value)) { > > + continue; > > + } > > + QLIST_FOREACH(item, &version->items, next) { > > + quorum_report_bad(acb, s->bs[item->index]->node_name); > > + } > > + } > > +} > > + > > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > > +{ > > + int i; > > + assert(dest->niov == source->niov); > > + assert(dest->size == source->size); > > + for (i = 0; i < source->niov; i++) { > > + assert(dest->iov[i].iov_len == source->iov[i].iov_len); > > + memcpy(dest->iov[i].iov_base, > > + source->iov[i].iov_base, > > + source->iov[i].iov_len); > > + } > > +} > > + > > +static void quorum_count_vote(QuorumVotes *votes, > > + QuorumVoteValue *value, > > + int index) > > +{ > > + QuorumVoteVersion *v = NULL, *version = NULL; > > + QuorumVoteItem *item; > > + > > + /* look if we have something with this hash */ > > + QLIST_FOREACH(v, &votes->vote_list, next) { > > + if (!votes->compare(&v->value, value)) { > > + version = v; > > + break; > > + } > > + } > > + > > + /* It's a
version not yet in the list add it */ > > + if (!version) { > > + version = g_new0(QuorumVoteVersion, 1); > > + QLIST_INIT(&version->items); > > + memcpy(&version->value, value, sizeof(version->value)); > > + version->index = index; > > + version->vote_count = 0; > > + QLIST_INSERT_HEAD(&votes->vote_list, version, next); > > + } > > + > > + version->vote_count++; > > + > > + item = g_new0(QuorumVoteItem, 1); > > + item->index = index; > > + QLIST_INSERT_HEAD(&version->items, item, next); > > +} > > + > > +static void quorum_free_vote_list(QuorumVotes *votes) > > +{ > > + QuorumVoteVersion *version, *next_version; > > + QuorumVoteItem *item, *next_item; > > + > > + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { > > + QLIST_REMOVE(version, next); > > + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { > > + QLIST_REMOVE(item, next); > > + g_free(item); > > + } > > + g_free(version); > > + } > > +} > > + > > +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue > > *hash) > > +{ > > + int j, ret; > > + gnutls_hash_hd_t dig; > > + QEMUIOVector *qiov = &acb->aios[i].qiov; > > + > > + ret = gnutls_hash_init(&dig, GNUTLS_DIG_SHA256); > > + > > + if (ret < 0) { > > + return ret; > > + } > > + > > + for (j = 0; j < qiov->niov; j++) { > > + ret = gnutls_hash(dig, qiov->iov[j].iov_base, > > qiov->iov[j].iov_len); > > + if (ret < 0) { > > + break; > > + } > > + } > > + > > + gnutls_hash_deinit(dig, (void *) hash); > > + return ret; > > +} > > + > > +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) > > +{ > > + int i = 0; > > I like obvious variable names. This must be a loop counter. > > > + QuorumVoteVersion *candidate, *winner = NULL; > > + > > + QLIST_FOREACH(candidate, &votes->vote_list, next) { > > + if (candidate->vote_count > i) { > > + i = candidate->vote_count; > > Wait, what? This doesn't quite look like a loop. 
> > > + winner = candidate; > > + } > > + } > > + > > + return winner; > > +} > > + > > +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) > > +{ > > + int i; > > + int result; > > + > > + assert(a->niov == b->niov); > > + for (i = 0; i < a->niov; i++) { > > + assert(a->iov[i].iov_len == b->iov[i].iov_len); > > + result = memcmp(a->iov[i].iov_base, > > + b->iov[i].iov_base, > > + a->iov[i].iov_len); > > + if (result) { > > + return false; > > + } > > + } > > + > > + return true; > > +} > > I thought we introduced qemu_iovec_compare() earlier in this series to > do exactly this, except more generically? > > I see that you call one or the other depending on whether we're running > in blkverify mode, but what is the difference in the semantics? Either > both are the same and there is no reason to have both, or one of them > must have non-obvious semantics and lacks proper documentation. > > > +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, > > + const char *fmt, ...) 
> > +{ > > + va_list ap; > > + > > + va_start(ap, fmt); > > + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", > > + acb->sector_num, acb->nb_sectors); > > + vfprintf(stderr, fmt, ap); > > + fprintf(stderr, "\n"); > > + va_end(ap); > > + exit(1); > > +} > > + > > +static bool quorum_compare(QuorumAIOCB *acb, > > + QEMUIOVector *a, > > + QEMUIOVector *b) > > +{ > > + BDRVQuorumState *s = acb->bqs; > > + ssize_t offset; > > + > > + /* This driver will replace blkverify in this particular case */ > > + if (s->is_blkverify) { > > + offset = qemu_iovec_compare(a, b); > > + if (offset != -1) { > > + quorum_err(acb, "contents mismatch in sector %" PRId64, > > + acb->sector_num + > > + (uint64_t)(offset / BDRV_SECTOR_SIZE)); > > + } > > + return true; > > + } > > + > > + return quorum_iovec_compare(a, b); > > +} > > + > > +/* Do a vote to get the error code */ > > +static int quorum_vote_error(QuorumAIOCB *acb) > > +{ > > + BDRVQuorumState *s = acb->bqs; > > + QuorumVoteVersion *winner = NULL; > > + QuorumVotes error_votes; > > + QuorumVoteValue result_value; > > + int i, ret = 0; > > + bool error = false; > > + > > + QLIST_INIT(&error_votes.vote_list); > > + error_votes.compare = quorum_64bits_compare; > > + > > + for (i = 0; i < s->total; i++) { > > + ret = acb->aios[i].ret; > > + if (ret) { > > + error = true; > > + result_value.l = ret; > > + quorum_count_vote(&error_votes, &result_value, i); > > + } > > + } > > + > > + if (error) { > > + winner = quorum_get_vote_winner(&error_votes); > > + ret = winner->value.l; > > + } > > + > > + quorum_free_vote_list(&error_votes); > > + > > + return ret; > > +} > > + > > +static void quorum_vote(QuorumAIOCB *acb) > > +{ > > + bool quorum = false; > > + int i, j, ret; > > + QuorumVoteValue hash; > > + BDRVQuorumState *s = acb->bqs; > > + QuorumVoteVersion *winner; > > + > > + QLIST_INIT(&acb->votes.vote_list); > > + > > + /* if we don't get enough successful read use the first error code */ > > + if 
(acb->success_count < s->threshold) { > > + acb->vote_ret = quorum_vote_error(acb); > > + quorum_report_failure(acb); > > + return; > > + } > > + > > + /* get the index of the first successful read (we are sure to find > > one) */ > > + for (i = 0; i < s->total; i++) { > > + if (!acb->aios[i].ret) { > > + break; > > + } > > + } > > "we are sure to find one" is spelt "assert(i < s->total);" > > > + > > + /* compare this read with all other successful read looking for quorum > > */ > > + for (j = i + 1; j < s->total; j++) { > > + if (acb->aios[j].ret) { > > + continue; > > + } > > + quorum = quorum_compare(acb, &acb->aios[i].qiov, > > &acb->aios[j].qiov); > > + if (!quorum) { > > + break; > > + } > > + } > > + > > + /* Every successful read agrees and their count is higher or equal > > threshold > > + * -> Quorum > > + */ > > + if (quorum && acb->success_count >= s->threshold) { > > + quorum_copy_qiov(acb->qiov, &acb->aios[i].qiov); > > + return; > > + } > > For threshold == success_count == 1, the condition in the comment is > fulfilled, but the one in the code isn't. > > > + > > + /* compute hashs for each successful read, also store indexes */ > > + for (i = 0; i < s->total; i++) { > > + if (acb->aios[i].ret) { > > + continue; > > + } > > + ret = quorum_compute_hash(acb, i, &hash); > > + /* if ever the hash computation failed */ > > + if (ret < 0) { > > + acb->vote_ret = ret; > > + goto free_exit; > > + } > > + quorum_count_vote(&acb->votes, &hash, i); > > + } > > + > > + /* vote to select the most represented version */ > > + winner = quorum_get_vote_winner(&acb->votes); > > + /* every vote version are differents -> error */ > > + if (!winner) { > > Can this happen? This means that there was no vote at all. 
> > > + quorum_report_failure(acb); > > + acb->vote_ret = -EIO; > > + goto free_exit; > > + } > > + > > + /* if the winner count is smaller than threshold the read fails */ > > + if (winner->vote_count < s->threshold) { > > + quorum_report_failure(acb); > > + acb->vote_ret = -EIO; > > + goto free_exit; > > + } > > + > > + /* we have a winner: copy it */ > > + quorum_copy_qiov(acb->qiov, &acb->aios[winner->index].qiov); > > + > > + /* some versions are bad print them */ > > + quorum_report_bad_versions(s, acb, &winner->value); > > + > > +free_exit: > > + /* free lists */ > > + quorum_free_vote_list(&acb->votes); > > +} > > + > > static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > > int64_t sector_num, > > QEMUIOVector *qiov, > > @@ -175,7 +557,7 @@ static BlockDriverAIOCB > > *quorum_aio_readv(BlockDriverState *bs, > > } > > > > for (i = 0; i < s->total; i++) { > > - bdrv_aio_readv(s->bs[i], sector_num, qiov, nb_sectors, > > + bdrv_aio_readv(s->bs[i], sector_num, &acb->aios[i].qiov, > > nb_sectors, > > quorum_aio_cb, &acb->aios[i]); > > } > > Why don't you do this from the beginning? > > Kevin