The Friday 11 Jul 2014 à 11:01:22 (+0800), Liu Yuan wrote : > This patch adds single read pattern to quorum driver and quorum vote is > default > pattern. > > For now we do a quorum vote on all the reads, it is designed for unreliable > underlying storage such as non-redundant NFS to make sure data integrity at > the > cost of the read performance. > > For some use cases as following: > > VM > -------------- > | | > v v > A B > > Both A and B has hardware raid storage to justify the data integrity on its > own. > So it would help performance if we do a single read instead of on all the > nodes. > Further, if we run VM on either of the storage node, we can make a local read > request for better performance. > > This patch generalize the above 2 nodes case in the N nodes. That is, > > vm -> write to all the N nodes, read just one of them. If single read fails, > we > try to read next node in FIFO order specified by the startup command. > > The 2 nodes case is very similar to DRBD[1] though lack of auto-sync > functionality in the single device/node failure for now. But compared with > DRBD > we still have some advantages over it: > > - Suppose we have 20 VMs running on one(assume A) of 2 nodes' DRBD backed > storage. And if A crashes, we need to restart all the VMs on node B. But for > practice case, we can't because B might not have enough resources to setup 20 > VMs > at once. So if we run our 20 VMs with quorum driver, and scatter the > replicated > images over the data center, we can very likely restart 20 VMs without any > resource problem. > > After all, I think we can build a more powerful replicated image functionality > on quorum and block jobs(block mirror) to meet various High Availibility > needs. > > E.g, Enable single read pattern on 2 children, > > -drive driver=quorum,children.0.file.filename=0.qcow2,\ > children.1.file.filename=1.qcow2,read-pattern=single,vote-threshold=1 > > [1] http://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device > > Cc: Benoit Canet <ben...@irqsave.net> > Cc: Kevin Wolf <kw...@redhat.com> > Cc: Stefan Hajnoczi <stefa...@redhat.com> > Signed-off-by: Liu Yuan <namei.u...@gmail.com> > --- > block/quorum.c | 174 > +++++++++++++++++++++++++++++++++++++++++---------------- > 1 file changed, 125 insertions(+), 49 deletions(-) > > diff --git a/block/quorum.c b/block/quorum.c > index d5ee9c0..2f18755 100644 > --- a/block/quorum.c > +++ b/block/quorum.c > @@ -24,6 +24,7 @@ > #define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" > #define QUORUM_OPT_BLKVERIFY "blkverify" > #define QUORUM_OPT_REWRITE "rewrite-corrupted" > +#define QUORUM_OPT_READ_PATTERN "read-pattern" > > /* This union holds a vote hash value */ > typedef union QuorumVoteValue { > @@ -74,6 +75,16 @@ typedef struct BDRVQuorumState { > bool rewrite_corrupted;/* true if the driver must rewrite-on-read > corrupted > * block if Quorum is reached. > */ > + > +#define READ_PATTERN_QUORUM 0 /* default */ > +#define READ_PATTERN_SINGLE 1
Why not making these choices an enum ? Would READ_PATTERN_SINGLE be more accuratelly described by READ_PATTERN_ROUND_ROBIN ? More generaly would s/single/round-robin/ be more descriptive. > + int read_pattern; /* single: read a single child and try first one > + * first. If error, try next child in an > + * FIFO order specifed by command line. > + * Return error if no child read succeeds. > + * quorum: read all the children and do a quorum > + * vote on reads. > + */ > } BDRVQuorumState; > > typedef struct QuorumAIOCB QuorumAIOCB; > @@ -117,6 +128,7 @@ struct QuorumAIOCB { > > bool is_read; > int vote_ret; > + int child_iter; /* which child to read in single pattern */ > }; > > static bool quorum_vote(QuorumAIOCB *acb); > @@ -256,6 +268,21 @@ static void quorum_rewrite_aio_cb(void *opaque, int ret) > quorum_aio_finalize(acb); > } > > +static BlockDriverAIOCB *read_single_child(QuorumAIOCB *acb); > + > +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > +{ > + int i; > + assert(dest->niov == source->niov); > + assert(dest->size == source->size); > + for (i = 0; i < source->niov; i++) { > + assert(dest->iov[i].iov_len == source->iov[i].iov_len); > + memcpy(dest->iov[i].iov_base, > + source->iov[i].iov_base, > + source->iov[i].iov_len); > + } > +} > + > static void quorum_aio_cb(void *opaque, int ret) > { > QuorumChildRequest *sacb = opaque; > @@ -263,6 +290,21 @@ static void quorum_aio_cb(void *opaque, int ret) > BDRVQuorumState *s = acb->common.bs->opaque; > bool rewrite = false; > > + if (acb->is_read && s->read_pattern == READ_PATTERN_SINGLE) { > + /* We try to read next child in FIFO order if we fail to read */ > + if (ret < 0 && ++acb->child_iter < s->num_children) { > + read_single_child(acb); Here the previous failed read is never finalized/freed. > + return; > + } > + > + if (ret == 0) { > + quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov); > + } > + acb->vote_ret = ret; > + quorum_aio_finalize(acb); > + return; > + } > + > sacb->ret = ret; > acb->count++; > if (ret == 0) { > @@ -276,7 +318,6 @@ static void quorum_aio_cb(void *opaque, int ret) > return; > } > > - /* Do the vote on read */ Why removing this comment ? > if (acb->is_read) { > rewrite = quorum_vote(acb); > } else { > @@ -343,19 +384,6 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState > *s, QuorumAIOCB *acb, > return count; > } > > -static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) > -{ > - int i; > - assert(dest->niov == source->niov); > - assert(dest->size == source->size); > - for (i = 0; i < source->niov; i++) { > - assert(dest->iov[i].iov_len == source->iov[i].iov_len); > - memcpy(dest->iov[i].iov_base, > - source->iov[i].iov_base, > - source->iov[i].iov_len); > - } > -} > - > static void quorum_count_vote(QuorumVotes *votes, > QuorumVoteValue *value, > int index) > @@ -615,34 +643,61 @@ free_exit: > return rewrite; > } > > -static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > - int64_t sector_num, > - QEMUIOVector *qiov, > - int nb_sectors, > - BlockDriverCompletionFunc *cb, > - void *opaque) > +static BlockDriverAIOCB *read_all_children(QuorumAIOCB *acb) > { > - BDRVQuorumState *s = bs->opaque; > - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, > - nb_sectors, cb, opaque); > + BDRVQuorumState *s = acb->common.bs->opaque; > int i; > > - acb->is_read = true; > - > for (i = 0; i < s->num_children; i++) { > - acb->qcrs[i].buf = qemu_blockalign(s->bs[i], qiov->size); > - qemu_iovec_init(&acb->qcrs[i].qiov, qiov->niov); > - qemu_iovec_clone(&acb->qcrs[i].qiov, qiov, acb->qcrs[i].buf); > + acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size); > + qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); > + qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); > } > > for (i = 0; i < s->num_children; i++) { > - bdrv_aio_readv(s->bs[i], sector_num, &acb->qcrs[i].qiov, nb_sectors, > - quorum_aio_cb, &acb->qcrs[i]); > + bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov, > + acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); > } > > return &acb->common; > } > > +static BlockDriverAIOCB *read_single_child(QuorumAIOCB *acb) > +{ > + BDRVQuorumState *s = acb->common.bs->opaque; > + > + acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter], > + acb->qiov->size); > + qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); > + qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, > + acb->qcrs[acb->child_iter].buf); > + bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num, > + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, > + quorum_aio_cb, &acb->qcrs[acb->child_iter]); > + > + return &acb->common; > +} > + > +static BlockDriverAIOCB *quorum_aio_readv(BlockDriverState *bs, > + int64_t sector_num, > + QEMUIOVector *qiov, > + int nb_sectors, > + BlockDriverCompletionFunc *cb, > + void *opaque) > +{ > + BDRVQuorumState *s = bs->opaque; > + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, > + nb_sectors, cb, opaque); > + acb->is_read = true; > + > + if (s->read_pattern == READ_PATTERN_QUORUM) { > + return read_all_children(acb); > + } > + > + acb->child_iter = 0; > + return read_single_child(acb); > +} > + > static BlockDriverAIOCB *quorum_aio_writev(BlockDriverState *bs, > int64_t sector_num, > QEMUIOVector *qiov, > @@ -782,6 +837,11 @@ static QemuOptsList quorum_runtime_opts = { > .type = QEMU_OPT_BOOL, > .help = "Rewrite corrupted block on read quorum", > }, > + { > + .name = QUORUM_OPT_READ_PATTERN, > + .type = QEMU_OPT_STRING, > + .help = "Allowed pattern: quorum, single. Quorum is default", > + }, > { /* end of list */ } > }, > }; > @@ -796,6 +856,7 @@ static int quorum_open(BlockDriverState *bs, QDict > *options, int flags, > QDict *sub = NULL; > QList *list = NULL; > const QListEntry *lentry; > + const char *read_pattern; > int i; > int ret = 0; > > @@ -827,28 +888,43 @@ static int quorum_open(BlockDriverState *bs, QDict > *options, int flags, > } > > s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); > - > - /* and validate it against s->num_children */ > - ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); > - if (ret < 0) { > - goto exit; > + read_pattern = qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN); > + if (read_pattern) { > + if (strcmp(read_pattern, "single") == 0) { > + s->read_pattern = READ_PATTERN_SINGLE; > + } else if (strcmp(read_pattern, "quorum") == 0) { > + s->read_pattern = READ_PATTERN_QUORUM; > + } else { > + error_setg(&local_err, > + "Please set read-pattern as single or quorum\n"); > + ret = -EINVAL; > + goto exit; > + } > } > > - /* is the driver in blkverify mode */ > - if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > - s->num_children == 2 && s->threshold == 2) { > - s->is_blkverify = true; > - } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { > - fprintf(stderr, "blkverify mode is set by setting blkverify=on " > - "and using two files with vote_threshold=2\n"); > - } > + if (s->read_pattern == READ_PATTERN_QUORUM) { > + /* and validate it against s->num_children */ > + ret = quorum_valid_threshold(s->threshold, s->num_children, > &local_err); > + if (ret < 0) { > + goto exit; > + } > > - s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, > false); > - if (s->rewrite_corrupted && s->is_blkverify) { > - error_setg(&local_err, > - "rewrite-corrupted=on cannot be used with blkverify=on"); > - ret = -EINVAL; > - goto exit; > + /* is the driver in blkverify mode */ > + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && > + s->num_children == 2 && s->threshold == 2) { > + s->is_blkverify = true; > + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { > + fprintf(stderr, "blkverify mode is set by setting blkverify=on " > + "and using two files with vote_threshold=2\n"); > + } > + > + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, > false); > + if (s->rewrite_corrupted && s->is_blkverify) { > + error_setg(&local_err, > + "rewrite-corrupted=on cannot be used with > blkverify=on"); > + ret = -EINVAL; > + goto exit; > + } > } > > /* allocate the children BlockDriverState array */ > -- > 1.9.1 >