On Wed, Jul 25, 2012 at 6:00 AM, Bharata B Rao <bhar...@linux.vnet.ibm.com> wrote: > block: Support GlusterFS as a QEMU block backend. > > From: Bharata B Rao <bhar...@linux.vnet.ibm.com> > > This patch adds gluster as the new block backend in QEMU. This gives > QEMU the ability to boot VM images from gluster volumes. Its already > possible to boot from VM images on gluster volumes, but this patchset > provides the ability to boot VM images from gluster volumes by by-passing > the FUSE layer in gluster. In case the image is present on the local > system, it is possible to even bypass client and server translator and > hence the RPC overhead. > > VM Image on gluster volume is specified like this: > > -drive file=gluster:server:[port]:[transport]:volname:image > > - Here 'gluster' is the protocol. > - 'server' specifies the server where the volume file specification for > the given volume resides. > - 'port' is the port number on which gluster management daemon (glusterd) is > listening. This is optional and if not specified, QEMU will send 0 which > will make libgfapi to use the default port. > - 'transport' specifies the transport used to connect to glusterd. This is > optional and if not specified, socket transport is used. > - 'volname' is the name of the gluster volume which contains the VM image. > - 'image' is the path to the actual VM image in the gluster volume. > > Eg 1: -drive file=gluster:server1:0:socket:test:/image > Eg 2: -drive file=gluster:server1:::test:/image > > Signed-off-by: Bharata B Rao <bhar...@linux.vnet.ibm.com> > --- > > block/Makefile.objs | 1 > block/gluster.c | 484 > +++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 485 insertions(+), 0 deletions(-) > create mode 100644 block/gluster.c > > > diff --git a/block/Makefile.objs b/block/Makefile.objs > index b5754d3..a1ae67f 100644 > --- a/block/Makefile.objs > +++ b/block/Makefile.objs > @@ -9,3 +9,4 @@ block-obj-$(CONFIG_POSIX) += raw-posix.o > block-obj-$(CONFIG_LIBISCSI) += iscsi.o > block-obj-$(CONFIG_CURL) += curl.o > block-obj-$(CONFIG_RBD) += rbd.o > +block-obj-$(CONFIG_GLUSTERFS) += gluster.o > diff --git a/block/gluster.c b/block/gluster.c > new file mode 100644 > index 0000000..b27971b > --- /dev/null > +++ b/block/gluster.c > @@ -0,0 +1,484 @@ > +/* > + * GlusterFS backend for QEMU > + * > + * (AIO implementation is derived from block/rbd.c) > + * > + * Copyright (C) 2012 Bharata B Rao <bhar...@linux.vnet.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2 or > + * (at your option) any later version. See the COPYING file in the top-level > + * directory. > + */ > +#include "block_int.h" > +#include <glusterfs/api/glfs.h> > + > +typedef struct GlusterAIOCB { > + BlockDriverAIOCB common; > + bool canceled; > + int64_t size; > + int ret; > +} GlusterAIOCB; > + > +typedef struct BDRVGlusterState { > + struct glfs *glfs; > + int fds[2]; > + struct glfs_fd *fd; > + int qemu_aio_count; > +} BDRVGlusterState; > + > +#define GLUSTER_FD_READ 0 > +#define GLUSTER_FD_WRITE 1 > + > +typedef enum { > + GOPT_PROTOCOL, > + GOPT_SERVER, > + GOPT_PORT, > + GOPT_TRANSPORT, > + GOPT_VOLNAME, > + GOPT_IMAGE, > + > + GOPT_LAST, > +} GlusterOptsEnum; > + > +struct GlusterOpts {
static > + bool optional; > + char defval[10]; const char *defval? > + char *value; > +} GlusterOpts[] = { > + {false, "", NULL }, > + {false, "", NULL }, > + {true, "0", NULL }, > + {true, "socket", NULL }, > + {false, "", NULL }, > + {false, "", NULL }, > +}; > + > +static void qemu_gluster_opts_free(void) > +{ > + int i; > + > + for (i = 0; i < GOPT_LAST; i++) { > + g_free(GlusterOpts[i].value); > + /* Prepare GlusterOpts to parse the next gluster drive (if any) */ > + GlusterOpts[i].value = NULL; > + } > +} > + > +/* > + * file=protocol:server:[port]:[transport]:volname:image > + */ > +static int qemu_gluster_parsename(const char *filename) > +{ > + char *p, *q, *r; > + int ret = -EINVAL; > + int i; > + > + p = q = r = g_strdup(filename); > + for (i = 0; i < GOPT_LAST; i++) { > + q = p; > + p = strchr(p, ':'); > + if (!p) { > + goto out; > + } > + > + if (p == q) { > + if (GlusterOpts[i].optional) { > + GlusterOpts[i].value = g_strdup(GlusterOpts[i].defval); > + p++; > + continue; > + } else { > + goto out; > + } > + } > + *p++ = '\0'; > + GlusterOpts[i].value = g_strdup(q); > + } > +out: > + if (i == GOPT_LAST-1 && strlen(q)) { Spaces around '-'. > + GlusterOpts[i].value = g_strdup(q); > + ret = 0; > + } > + g_free(r); > + return ret; > +} > + > +static struct glfs *qemu_gluster_init(const char *filename) > +{ > + struct glfs *glfs = NULL; > + int ret; > + int port; > + > + ret = qemu_gluster_parsename(filename); > + if (ret < 0) { > + errno = -ret; > + goto out; > + } > + > + port = strtoul(GlusterOpts[GOPT_PORT].value, NULL, 0); > + if (port < 0) { port > 65535 could be bad too. > + goto out; > + } > + > + glfs = glfs_new(GlusterOpts[GOPT_VOLNAME].value); > + if (!glfs) { > + goto out; > + } > + > + ret = glfs_set_volfile_server(glfs, GlusterOpts[GOPT_TRANSPORT].value, > + GlusterOpts[GOPT_SERVER].value, port); > + if (ret < 0) { > + goto out; > + } > + > + /* > + * TODO: When GlusterFS exports logging.h, use GF_LOG_ERROR instead of > + * hard code value of 4 here. 
> + */ > + ret = glfs_set_logging(glfs, "-", 4); > + if (ret < 0) { > + goto out; > + } > + > + ret = glfs_init(glfs); > + if (ret < 0) { > + goto out; > + } > + return glfs; > + > +out: > + if (glfs) { > + glfs_fini(glfs); > + } > + return NULL; > +} > + > +static void qemu_gluster_complete_aio(GlusterAIOCB *acb) > +{ > + int ret; > + > + if (acb->canceled) { > + qemu_aio_release(acb); > + return; > + } > + > + if (acb->ret == acb->size) { > + ret = 0; /* Success */ > + } else if (acb->ret < 0) { > + ret = acb->ret; /* Read/Write failed */ > + } else { > + ret = -EIO; /* Partial read/write - fail it */ > + } > + acb->common.cb(acb->common.opaque, ret); > + qemu_aio_release(acb); > +} > + > +static void qemu_gluster_aio_event_reader(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + GlusterAIOCB *event_acb; > + int event_reader_pos = 0; > + ssize_t ret; > + > + do { > + char *p = (char *)&event_acb; > + > + ret = read(s->fds[GLUSTER_FD_READ], p + event_reader_pos, > + sizeof(event_acb) - event_reader_pos); > + if (ret > 0) { > + event_reader_pos += ret; > + if (event_reader_pos == sizeof(event_acb)) { > + event_reader_pos = 0; > + qemu_gluster_complete_aio(event_acb); > + s->qemu_aio_count--; > + } > + } > + } while (ret < 0 && errno == EINTR); > +} > + > +static int qemu_gluster_aio_flush_cb(void *opaque) > +{ > + BDRVGlusterState *s = opaque; > + > + return (s->qemu_aio_count > 0); > +} > + > +static int qemu_gluster_open(BlockDriverState *bs, const char *filename, > + int bdrv_flags) > +{ > + BDRVGlusterState *s = bs->opaque; > + int open_flags = 0; > + int ret = 0; > + > + s->glfs = qemu_gluster_init(filename); > + if (!s->glfs) { > + ret = -errno; > + goto out; > + } > + > + open_flags |= O_BINARY; > + open_flags &= ~O_ACCMODE; > + if (bdrv_flags & BDRV_O_RDWR) { > + open_flags |= O_RDWR; > + } else { > + open_flags |= O_RDONLY; > + } > + > + if ((bdrv_flags & BDRV_O_NOCACHE)) { > + open_flags |= O_DIRECT; > + } > + > + s->fd = glfs_open(s->glfs, GlusterOpts[GOPT_IMAGE].value, open_flags); > + if (!s->fd) { > + ret = -errno; > + goto out; > + } > + > + ret = qemu_pipe(s->fds); > + if (ret < 0) { > + goto out; > + } > + fcntl(s->fds[0], F_SETFL, O_NONBLOCK); > + fcntl(s->fds[1], F_SETFL, O_NONBLOCK); > + qemu_aio_set_fd_handler(s->fds[GLUSTER_FD_READ], > + qemu_gluster_aio_event_reader, NULL, qemu_gluster_aio_flush_cb, s); > + > +out: > + qemu_gluster_opts_free(); > + if (!ret) { > + return ret; > + } > + if (s->fd) { > + glfs_close(s->fd); > + } > + if (s->glfs) { > + glfs_fini(s->glfs); > + } > + return ret; > +} > + > +static int qemu_gluster_create(const char *filename, > + QEMUOptionParameter *options) > +{ > + struct glfs *glfs; > + struct glfs_fd *fd; > + int ret = 0; > + int64_t total_size = 0; > + > + glfs = qemu_gluster_init(filename); > + if (!glfs) { > + ret = -errno; > + goto out; > + } > + > + while (options && options->name) { > + if (!strcmp(options->name, BLOCK_OPT_SIZE)) { > + total_size = options->value.n / BDRV_SECTOR_SIZE; > + } > + options++; > + } > + > + fd = glfs_creat(glfs, GlusterOpts[GOPT_IMAGE].value, > + O_WRONLY|O_CREAT|O_TRUNC|O_BINARY, S_IRUSR|S_IWUSR); Spaces around '|'. 
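To make those two nits concrete (the missing upper bound on the port and the spaces around '|'), e.g. something along these lines; this is only an untested sketch that keeps the patch's strtoul usage as-is:

port = strtoul(GlusterOpts[GOPT_PORT].value, NULL, 0);
if (port < 0 || port > 65535) {
    /* reject ports outside the valid range, not just negative values */
    goto out;
}

fd = glfs_creat(glfs, GlusterOpts[GOPT_IMAGE].value,
        O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR);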
> + if (!fd) { > + ret = -errno; > + } else { > + if (glfs_ftruncate(fd, total_size * BDRV_SECTOR_SIZE) != 0) { > + ret = -errno; > + } > + if (glfs_close(fd) != 0) { > + ret = -errno; > + } > + } > +out: > + qemu_gluster_opts_free(); > + if (glfs) { > + glfs_fini(glfs); > + } > + return ret; > +} > + > +static void qemu_gluster_aio_cancel(BlockDriverAIOCB *blockacb) > +{ > + GlusterAIOCB *acb = (GlusterAIOCB *)blockacb; > + > + acb->common.cb(acb->common.opaque, -ECANCELED); > + acb->canceled = true; > +} > + > +static AIOPool gluster_aio_pool = { > + .aiocb_size = sizeof(GlusterAIOCB), > + .cancel = qemu_gluster_aio_cancel, > +}; > + > +static int qemu_gluster_send_pipe(BDRVGlusterState *s, GlusterAIOCB *acb) > +{ > + int ret = 0; > + while (1) { > + fd_set wfd; > + int fd = s->fds[GLUSTER_FD_WRITE]; > + > + ret = write(fd, (void *)&acb, sizeof(acb)); > + if (ret >= 0) { > + break; > + } > + if (errno == EINTR) { > + continue; > + } > + if (errno != EAGAIN) { > + break; > + } > + > + FD_ZERO(&wfd); > + FD_SET(fd, &wfd); > + do { > + ret = select(fd + 1, NULL, &wfd, NULL, NULL); > + } while (ret < 0 && errno == EINTR); > + } > + return ret; > +} > + > +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) > +{ > + GlusterAIOCB *acb = (GlusterAIOCB *)arg; > + BDRVGlusterState *s = acb->common.bs->opaque; > + > + acb->ret = ret; > + if (qemu_gluster_send_pipe(s, acb) < 0) { > + error_report("Could not complete read/write/flush from gluster"); > + abort(); > + } > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_rw(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque, int write) > +{ > + int ret; > + GlusterAIOCB *acb; > + BDRVGlusterState *s = bs->opaque; > + size_t size; > + off_t offset; > + > + offset = sector_num * BDRV_SECTOR_SIZE; > + size = nb_sectors * BDRV_SECTOR_SIZE; > + s->qemu_aio_count++; > + > + acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque); > + acb->size = size; > + acb->ret = 0; > + acb->canceled = false; > + > + if (write) { > + ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, > + &gluster_finish_aiocb, acb); > + } else { > + ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, > + &gluster_finish_aiocb, acb); > + } > + > + if (ret < 0) { > + goto out; > + } > + return &acb->common; > + > +out: > + s->qemu_aio_count--; > + qemu_aio_release(acb); > + return NULL; > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_readv(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, > 0); > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_writev(BlockDriverState *bs, > + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + return qemu_gluster_aio_rw(bs, sector_num, qiov, nb_sectors, cb, opaque, > 1); > +} > + > +static BlockDriverAIOCB *qemu_gluster_aio_flush(BlockDriverState *bs, > + BlockDriverCompletionFunc *cb, void *opaque) > +{ > + int ret; > + GlusterAIOCB *acb; > + BDRVGlusterState *s = bs->opaque; > + > + acb = qemu_aio_get(&gluster_aio_pool, bs, cb, opaque); > + acb->size = 0; > + acb->ret = 0; > + acb->canceled = false; > + s->qemu_aio_count++; > + > + ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); > + if (ret < 0) { > + goto out; > + } > + return &acb->common; > + > +out: > + s->qemu_aio_count--; > + 
qemu_aio_release(acb); > + return NULL; > +} > + > +static int64_t qemu_gluster_getlength(BlockDriverState *bs) > +{ > + BDRVGlusterState *s = bs->opaque; > + struct stat st; > + int ret; > + > + ret = glfs_fstat(s->fd, &st); > + if (ret < 0) { > + return -errno; > + } else { > + return st.st_size; > + } > +} > + > +static void qemu_gluster_close(BlockDriverState *bs) > +{ > + BDRVGlusterState *s = bs->opaque; > + > + if (s->fd) { > + glfs_close(s->fd); > + s->fd = NULL; > + } > + glfs_fini(s->glfs); > +} > + > +static QEMUOptionParameter qemu_gluster_create_options[] = { > + { > + .name = BLOCK_OPT_SIZE, > + .type = OPT_SIZE, > + .help = "Virtual disk size" > + }, > + { NULL } > +}; > + > +static BlockDriver bdrv_gluster = { > + .format_name = "gluster", > + .protocol_name = "gluster", > + .instance_size = sizeof(BDRVGlusterState), > + .bdrv_file_open = qemu_gluster_open, > + .bdrv_close = qemu_gluster_close, > + .bdrv_create = qemu_gluster_create, > + .bdrv_getlength = qemu_gluster_getlength, > + > + .bdrv_aio_readv = qemu_gluster_aio_readv, > + .bdrv_aio_writev = qemu_gluster_aio_writev, > + .bdrv_aio_flush = qemu_gluster_aio_flush, > + > + .create_options = qemu_gluster_create_options, > +}; > + > +static void bdrv_gluster_init(void) > +{ > + bdrv_register(&bdrv_gluster); > +} > + > +block_init(bdrv_gluster_init); > >
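Also, to illustrate the two declaration nits near the top of block/gluster.c (making the option table static and turning defval into a const char *), roughly this, untested:

static struct GlusterOpts {
    bool optional;
    const char *defval;
    char *value;
} GlusterOpts[] = {
    {false, "", NULL },
    {false, "", NULL },
    {true, "0", NULL },
    {true, "socket", NULL },
    {false, "", NULL },
    {false, "", NULL },
};

That keeps the table out of the global symbol namespace and drops the fixed-size defval buffer; g_strdup(GlusterOpts[i].defval) still works unchanged.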