>> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present >> in case we later want to add some functionality to it. I have now >> added a comment to this effect to avoid any confusion. >> > > The problem is you don't know which version of the qnio library any given > QEMU binary will be using, since it is a shared library. Future versions > may implement the flush ioctl as expressed above, in which case we may hide > a valid error. > > Am I correct in assuming that this call suppresses errors because an error > is returned for an unknown ioctl operation of VDISK_AIO_FLUSH? If so, and > you want a placeholder here for flushing, you should go all the way and stub > out the underlying ioctl call to return success. Then QEMU can at least > rely on the error return from the flush operation. > >
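For reference, assuming libqnio is changed to stub VDISK_AIO_FLUSH so that it returns success, the QEMU side could then simply report any error it does get back. A rough sketch only, reusing the qemu_iio_ioctl() call and fields already in the patch (not the final code):

    int vxhs_co_flush(BlockDriverState *bs)
    {
        BDRVVXHSState *s = bs->opaque;
        uint64_t size = 0;
        int ret;

        ret = qemu_iio_ioctl(s->qnio_ctx,
                             s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd,
                             VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC);
        if (ret < 0) {
            /* propagate the failure instead of masking it as success */
            trace_vxhs_co_flush(s->vdisk_guid, ret, errno);
            ret = -EIO;
        }
        return ret;
    }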
I agree. Will change accordingly. I also completely agree on the discussion of libqnio calling a qemu function. Will check on the best way to resolve that and get back. Thanks, Ashish On Thu, Sep 8, 2016 at 7:00 AM, Jeff Cody <jc...@redhat.com> wrote: > On Wed, Sep 07, 2016 at 03:32:47PM -0700, ashish mittal wrote: >> Hi Jeff, >> >> Thank you for the comments and corrections. >> >> I have submitted a v5 patch after incorporating most of your review >> comments. Please find details in-line. >> >> Thanks, >> Ashish >> >> On Tue, Aug 30, 2016 at 10:35 AM, Jeff Cody <jc...@redhat.com> wrote: >> > >> > First-pass-over-the-code review: >> > >> > On Mon, Aug 22, 2016 at 11:56:30PM -0700, Ashish Mittal wrote: >> >> This patch adds support for a new block device type called "vxhs". >> >> Source code for the library that this code loads can be downloaded from: >> >> https://github.com/MittalAshish/libqnio.git >> >> >> >> Sample command line using JSON syntax: >> >> ./qemu-system-x86_64 -name instance-00000008 -S -vnc 0.0.0.0:0 -k en-us >> >> -vga cirrus -device virtio-balloon-pci,id=balloon0,bus=pci.0,addr=0x5 >> >> -msg timestamp=on >> >> 'json:{"driver":"vxhs","vdisk_id":"{c3e9095a-a5ee-4dce-afeb-2a59fb387410}","server":[{"host":"172.172.17.4","port":"9999"},{"host":"172.172.17.2","port":"9999"}]}' >> >> >> >> Sample command line using URI syntax: >> >> qemu-img convert -f raw -O raw -n >> >> /var/lib/nova/instances/_base/0c5eacd5ebea5ed914b6a3e7b18f1ce734c386ad >> >> vxhs://192.168.0.1:9999/%7Bc6718f6b-0401-441d-a8c3-1f0064d75ee0%7D >> >> >> > >> > Is there a reference specification for vxhs available? If so, the commit >> > message would be a good place to drop a link. >> > >> >> We don't have a reference specification doc yet. Although, I will >> shortly be adding a test program to libqnio that can be used to >> test/demo basic libqnio IO transfer. >> >> >> Signed-off-by: Ashish Mittal <ashish.mit...@veritas.com> >> >> --- >> >> v3 changelog: >> >> ============= >> >> (1) Reworked QAPI/JSON parsing. >> >> (2) Reworked URI parsing as suggested by Kevin. >> >> (3) Fixes per review comments from Stefan on v1. >> >> (4) Fixes per review comments from Daniel on v3. 
>> >> >> >> block/Makefile.objs | 2 + >> >> block/trace-events | 41 ++ >> >> block/vxhs.c | 1304 >> >> +++++++++++++++++++++++++++++++++++++++++++++++++++ >> >> block/vxhs.h | 237 ++++++++++ >> >> configure | 50 ++ >> >> 5 files changed, 1634 insertions(+) >> >> create mode 100644 block/vxhs.c >> >> create mode 100644 block/vxhs.h >> >> >> >> diff --git a/block/Makefile.objs b/block/Makefile.objs >> >> index 2593a2f..5f83305 100644 >> >> --- a/block/Makefile.objs >> >> +++ b/block/Makefile.objs >> >> @@ -20,6 +20,7 @@ block-obj-$(CONFIG_RBD) += rbd.o >> >> block-obj-$(CONFIG_GLUSTERFS) += gluster.o >> >> block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o >> >> block-obj-$(CONFIG_LIBSSH2) += ssh.o >> >> +block-obj-$(CONFIG_VXHS) += vxhs.o >> >> block-obj-y += accounting.o dirty-bitmap.o >> >> block-obj-y += write-threshold.o >> >> >> >> @@ -43,3 +44,4 @@ block-obj-m += dmg.o >> >> dmg.o-libs := $(BZIP2_LIBS) >> >> qcow.o-libs := -lz >> >> linux-aio.o-libs := -laio >> >> +vxhs.o-libs := $(VXHS_LIBS) >> >> diff --git a/block/trace-events b/block/trace-events >> >> index 05fa13c..06c6d8c 100644 >> >> --- a/block/trace-events >> >> +++ b/block/trace-events >> >> @@ -114,3 +114,44 @@ qed_aio_write_data(void *s, void *acb, int ret, >> >> uint64_t offset, size_t len) "s >> >> qed_aio_write_prefill(void *s, void *acb, uint64_t start, size_t len, >> >> uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 >> >> qed_aio_write_postfill(void *s, void *acb, uint64_t start, size_t len, >> >> uint64_t offset) "s %p acb %p start %"PRIu64" len %zu offset %"PRIu64 >> >> qed_aio_write_main(void *s, void *acb, int ret, uint64_t offset, size_t >> >> len) "s %p acb %p ret %d offset %"PRIu64" len %zu" >> >> + >> >> +# block/vxhs.c >> >> +vxhs_bdrv_init(const char c) "Registering VxHS AIO driver%c" >> >> +vxhs_iio_callback(int error, int reason) "ctx is NULL: error %d, reason >> >> %d" >> >> +vxhs_setup_qnio(void *s) "Context to HyperScale IO manager = %p" >> >> +vxhs_setup_qnio_nwerror(char c) "Could not initialize the network >> >> channel. Bailing out%c" >> >> +vxhs_iio_callback_iofail(int err, int reason, void *acb, int seg) >> >> "Read/Write failed: error %d, reason %d, acb %p, segment %d" >> >> +vxhs_iio_callback_retry(char *guid, void *acb) "vDisk %s, added acb %p >> >> to retry queue (5)" >> >> +vxhs_iio_callback_chnlfail(int error) "QNIO channel failed, no i/o (%d)" >> >> +vxhs_iio_callback_fail(int r, void *acb, int seg, uint64_t size, int >> >> err) " ALERT: reason = %d , acb = %p, acb->segments = %d, acb->size = %lu >> >> Error = %d" >> >> +vxhs_fail_aio(char * guid, void *acb) "vDisk %s, failing acb %p" >> >> +vxhs_iio_callback_ready(char *vd, int err) "async vxhs_iio_callback: >> >> IRP_VDISK_CHECK_IO_FAILOVER_READY completed for vdisk %s with error %d" >> >> +vxhs_iio_callback_chnfail(int err, int error) "QNIO channel failed, no >> >> i/o %d, %d" >> >> +vxhs_iio_callback_unknwn(int opcode, int err) "unexpected opcode %d, >> >> errno %d" >> >> +vxhs_open_fail(int ret) "Could not open the device. Error = %d" >> >> +vxhs_open_epipe(char c) "Could not create a pipe for device. 
Bailing >> >> out%c" >> >> +vxhs_aio_rw(char *guid, int iodir, uint64_t size, uint64_t offset) >> >> "vDisk %s, vDisk device is in failed state iodir = %d size = %lu offset = >> >> %lu" >> >> +vxhs_aio_rw_retry(char *guid, void *acb, int queue) "vDisk %s, added acb >> >> %p to retry queue(%d)" >> >> +vxhs_aio_rw_invalid(int req) "Invalid I/O request iodir %d" >> >> +vxhs_aio_rw_ioerr(char *guid, int iodir, uint64_t size, uint64_t off, >> >> void *acb, int seg, int ret, int err) "IO ERROR (vDisk %s) FOR : >> >> Read/Write = %d size = %lu offset = %lu ACB = %p Segments = %d. Error = >> >> %d, errno = %d" >> >> +vxhs_co_flush(char *guid, int ret, int err) "vDisk (%s) Flush ioctl >> >> failed ret = %d errno = %d" >> >> +vxhs_get_vdisk_stat_err(char *guid, int ret, int err) "vDisk (%s) stat >> >> ioctl failed, ret = %d, errno = %d" >> >> +vxhs_get_vdisk_stat(char *vdisk_guid, uint64_t vdisk_size) "vDisk %s >> >> stat ioctl returned size %lu" >> >> +vxhs_switch_storage_agent(char *ip, char *guid) "Query host %s for vdisk >> >> %s" >> >> +vxhs_switch_storage_agent_failed(char *ip, char *guid, int res, int err) >> >> "Query to host %s for vdisk %s failed, res = %d, errno = %d" >> >> +vxhs_failover_ioctl_cb(char *ip, char *guid) "Switched to storage server >> >> host-IP %s for vdisk %s" >> >> +vxhs_failover_ioctl_cb_retry(char *guid) "failover_ioctl_cb: keep >> >> looking for io target for vdisk %s" >> >> +vxhs_failover_io(char *vdisk) "I/O Failover starting for vDisk %s" >> >> +vxhs_reopen_vdisk(char *ip) "Failed to connect to storage agent on >> >> host-ip %s" >> >> +vxhs_reopen_vdisk_openfail(char *fname) "Failed to open vdisk device: %s" >> >> +vxhs_handle_queued_ios(void *acb, int res) "Restarted acb %p res %d" >> >> +vxhs_restart_aio(int dir, int res, int err) "IO ERROR FOR: Read/Write = >> >> %d Error = %d, errno = %d" >> >> +vxhs_complete_aio(void *acb, uint64_t ret) "aio failed acb %p ret %ld" >> >> +vxhs_aio_rw_iofail(char *guid) "vDisk %s, I/O operation failed." >> >> +vxhs_aio_rw_devfail(char *guid, int dir, uint64_t size, uint64_t off) >> >> "vDisk %s, vDisk device failed iodir = %d size = %lu offset = %lu" >> >> +vxhs_parse_uri_filename(const char *filename) "URI passed via >> >> bdrv_parse_filename %s" >> >> +vxhs_qemu_init_vdisk(const char *vdisk_id) "vdisk_id from json %s" >> >> +vxhs_qemu_init_numservers(int num_servers) "Number of servers passed = >> >> %d" >> >> +vxhs_parse_uri_hostinfo(int num, char *host, int port) "Host %d: IP %s, >> >> Port %d" >> >> +vxhs_qemu_init(char *of_vsa_addr, int port) "Adding host %s:%d to >> >> BDRVVXHSState" >> >> +vxhs_qemu_init_filename(const char *filename) "Filename passed as %s" >> >> diff --git a/block/vxhs.c b/block/vxhs.c >> >> new file mode 100644 >> >> index 0000000..32c37c5 >> >> --- /dev/null >> >> +++ b/block/vxhs.c >> >> @@ -0,0 +1,1304 @@ >> >> +/* >> >> + * QEMU Block driver for Veritas HyperScale (VxHS) >> >> + * >> > >> > Another good place for a link to the specification, if available. >> > >> > >> >> + * This work is licensed under the terms of the GNU GPL, version 2 or >> >> later. >> >> + * See the COPYING file in the top-level directory. >> >> + * >> >> + */ >> >> + >> >> +#include "vxhs.h" >> >> +#include <qnio/qnio_api.h> >> >> +#include "trace.h" >> >> +#include "qapi/qmp/qerror.h" >> >> +#include "qapi/qmp/qdict.h" >> >> +#include "qapi/qmp/qstring.h" >> >> + >> >> +#define VXHS_OPT_FILENAME "filename" >> >> +#define VXHS_OPT_VDISK_ID "vdisk_id" >> >> +#define VXHS_OPT_SERVER "server." 
>> >> +#define VXHS_OPT_HOST "host" >> >> +#define VXHS_OPT_PORT "port" >> >> + >> >> +/* qnio client ioapi_ctx */ >> >> +static void *global_qnio_ctx; >> >> + >> >> +/* vdisk prefix to pass to qnio */ >> >> +static const char vdisk_prefix[] = "/dev/of/vdisk"; >> >> + >> >> +void vxhs_inc_acb_segment_count(void *ptr, int count) >> >> +{ >> >> + VXHSAIOCB *acb = ptr; >> >> + BDRVVXHSState *s = acb->common.bs->opaque; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + acb->segments += count; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> +} >> >> + >> >> +void vxhs_dec_acb_segment_count(void *ptr, int count) >> >> +{ >> >> + VXHSAIOCB *acb = ptr; >> >> + BDRVVXHSState *s = acb->common.bs->opaque; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + acb->segments -= count; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> +} >> >> + >> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count) >> >> +{ >> >> + VXHSAIOCB *acb = ptr; >> >> + BDRVVXHSState *s = acb->common.bs->opaque; >> >> + int segcount = 0; >> >> + >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + acb->segments -= count; >> >> + segcount = acb->segments; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> + >> >> + return segcount; >> >> +} >> > >> > Unused function? >> > >> >> Yes, it is unused. Sorry I forgot to remove it from V5. Will remember >> to remove in the next iteration. >> >> >> + >> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer) >> >> +{ >> >> + VXHSAIOCB *acb = ptr; >> >> + >> >> + acb->buffer = buffer; >> >> +} >> >> + >> > >> > Unused function? >> > >> >> This is called from within libqnio. >> >> > (There are several unused functions - are there other patches that use >> > these >> > functions? Maybe I missed them on the list. If not, and they are for >> > future work, maybe it would be best to introduce these functions in the >> > patch series that will actually use them. It will cut down on the amount of >> > code to be reviewed, and since it is not exercised it can't be verified >> > anyway). >> > >> >> Removed all unused functions, except vxhs_dec_and_get_acb_segment_count(). >> >> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t count) >> >> +{ >> >> + BDRVVXHSState *s = ptr; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + s->vdisk_aio_count += count; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> +} >> >> + >> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t count) >> >> +{ >> >> + BDRVVXHSState *s = ptr; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + s->vdisk_aio_count -= count; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> +} >> >> + >> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr) >> >> +{ >> >> + BDRVVXHSState *s = ptr; >> >> + uint32_t count = 0; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + count = s->vdisk_aio_count; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + >> >> + return count; >> >> +} >> > >> > Function only used by another function that is not called from anywhere. >> > >> > >> >> Removed. 
>> >> >> + >> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m) >> >> +{ >> >> + VXHSAIOCB *acb = NULL; >> >> + BDRVVXHSState *s = NULL; >> >> + int rv = 0; >> >> + int segcount = 0; >> >> + uint32_t error = 0; >> >> + uint32_t opcode = 0; >> >> + >> >> + assert(m); >> >> + if (m) { >> >> + /* TODO: need common get message attrs, not two separate lib >> >> calls */ >> >> + error = qemu_iio_extract_msg_error(m); >> >> + opcode = qemu_iio_extract_msg_opcode(m); >> >> + } >> >> + switch (opcode) { >> >> + case IRP_READ_REQUEST: >> >> + case IRP_WRITE_REQUEST: >> >> + >> >> + /* >> >> + * ctx is VXHSAIOCB* >> >> + * ctx is NULL if error is VXERROR_CHANNEL_HUP or reason is >> >> IIO_REASON_HUP >> >> + */ >> >> + if (ctx) { >> >> + acb = ctx; >> >> + s = acb->common.bs->opaque; >> >> + } else { >> >> + trace_vxhs_iio_callback(error, reason); >> >> + goto out; >> >> + } >> >> + >> >> + if (error) { >> >> + trace_vxhs_iio_callback_iofail(error, reason, acb, >> >> acb->segments); >> >> + >> >> + if (reason == IIO_REASON_DONE || reason == IIO_REASON_EVENT) { >> >> + /* >> >> + * Storage agent failed while I/O was in progress >> >> + * Fail over only if the qnio channel dropped, indicating >> >> + * storage agent failure. Don't fail over in response to >> >> other >> >> + * I/O errors such as disk failure. >> >> + */ >> >> + if (error == VXERROR_RETRY_ON_SOURCE || error == VXERROR_HUP >> >> || >> >> + error == VXERROR_CHANNEL_HUP || error == -1) { >> >> + /* >> >> + * Start vDisk IO failover once callback is >> >> + * called against all the pending IOs. >> >> + * If vDisk has no redundancy enabled >> >> + * then IO failover routine will mark >> >> + * the vDisk failed and fail all the >> >> + * AIOs without retry (stateless vDisk) >> >> + */ >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + if (!OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { >> >> + OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s); >> >> + } >> >> + /* >> >> + * Check if this acb is already queued before. >> >> + * It is possible in case if I/Os are submitted >> >> + * in multiple segments (QNIO_MAX_IO_SIZE). >> >> + */ >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + if (!OF_AIOCB_FLAGS_QUEUED(acb)) { >> >> + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, >> >> + acb, retry_entry); >> >> + OF_AIOCB_FLAGS_SET_QUEUED(acb); >> >> + s->vdisk_aio_retry_qd++; >> >> + trace_vxhs_iio_callback_retry(s->vdisk_guid, acb); >> >> + } >> >> + segcount = --acb->segments; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> + /* >> >> + * Decrement AIO count only when callback is called >> >> + * against all the segments of aiocb. >> >> + */ >> >> + if (segcount == 0 && --s->vdisk_aio_count == 0) { >> >> + /* >> >> + * Start vDisk I/O failover >> >> + */ >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + /* >> >> + * TODO: >> >> + * Need to explore further if it is possible to >> >> optimize >> >> + * the failover operation on Virtual-Machine (global) >> >> + * specific rather vDisk specific. 
>> >> + */ >> >> + vxhs_failover_io(s); >> >> + goto out; >> >> + } >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto out; >> >> + } >> >> + } else if (reason == IIO_REASON_HUP) { >> >> + /* >> >> + * Channel failed, spontaneous notification, >> >> + * not in response to I/O >> >> + */ >> >> + trace_vxhs_iio_callback_chnlfail(error); >> >> + /* >> >> + * TODO: Start channel failover when no I/O is outstanding >> >> + */ >> >> + goto out; >> >> + } else { >> >> + trace_vxhs_iio_callback_fail(reason, acb, acb->segments, >> >> + acb->size, error); >> >> + } >> >> + } >> >> + /* >> >> + * Set error into acb if not set. In case if acb is being >> >> + * submitted in multiple segments then need to set the error >> >> + * only once. >> >> + * >> >> + * Once acb done callback is called for the last segment >> >> + * then acb->ret return status will be sent back to the >> >> + * caller. >> >> + */ >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + if (error && !acb->ret) { >> >> + acb->ret = error; >> >> + } >> >> + --acb->segments; >> >> + segcount = acb->segments; >> >> + assert(segcount >= 0); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> + /* >> >> + * Check if all the outstanding I/Os are done against acb. >> >> + * If yes then send signal for AIO completion. >> >> + */ >> >> + if (segcount == 0) { >> >> + rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); >> >> + if (rv != sizeof(acb)) { >> >> + error_report("VXHS AIO completion failed: %s", >> >> strerror(errno)); >> >> + abort(); >> >> + } >> >> + } >> >> + break; >> >> + >> >> + case IRP_VDISK_CHECK_IO_FAILOVER_READY: >> >> + /* ctx is BDRVVXHSState* */ >> >> + assert(ctx); >> >> + trace_vxhs_iio_callback_ready(((BDRVVXHSState *)ctx)->vdisk_guid, >> >> + error); >> >> + vxhs_failover_ioctl_cb(error, ctx); >> >> + break; >> >> + >> >> + default: >> >> + if (reason == IIO_REASON_HUP) { >> >> + /* >> >> + * Channel failed, spontaneous notification, >> >> + * not in response to I/O >> >> + */ >> >> + trace_vxhs_iio_callback_chnfail(error, errno); >> >> + /* >> >> + * TODO: Start channel failover when no I/O is outstanding >> >> + */ >> >> + } else { >> >> + trace_vxhs_iio_callback_unknwn(opcode, error); >> >> + } >> >> + break; >> >> + } >> >> +out: >> >> + return; >> >> +} >> >> + >> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s) >> >> +{ >> >> + BlockCompletionFunc *cb = acb->common.cb; >> >> + void *opaque = acb->common.opaque; >> >> + int ret = 0; >> >> + >> >> + if (acb->ret != 0) { >> >> + trace_vxhs_complete_aio(acb, acb->ret); >> >> + /* >> >> + * We mask all the IO errors generically as EIO for upper layers >> >> + * Right now our IO Manager uses non standard error codes. Instead >> >> + * of confusing upper layers with incorrect interpretation we are >> >> + * doing this workaround. >> >> + */ >> >> + ret = (-EIO); >> >> + } >> >> + /* >> >> + * Copy back contents from stablization buffer into original iovector >> >> + * before returning the IO >> >> + */ >> >> + if (acb->buffer != NULL) { >> >> + qemu_iovec_from_buf(acb->qiov, 0, acb->buffer, acb->qiov->size); >> >> + free(acb->buffer); >> >> + acb->buffer = NULL; >> >> + } >> >> + vxhs_dec_vdisk_iocount(s, 1); >> >> + acb->aio_done = VXHS_IO_COMPLETED; >> >> + qemu_aio_unref(acb); >> >> + cb(opaque, ret); >> >> +} >> >> + >> >> +/* >> >> + * This is the HyperScale event handler registered to QEMU. >> >> + * It is invoked when any IO gets completed and written on pipe >> >> + * by callback called from QNIO thread context. 
Then it marks >> >> + * the AIO as completed, and releases HyperScale AIO callbacks. >> >> + */ >> >> +void vxhs_aio_event_reader(void *opaque) >> >> +{ >> >> + BDRVVXHSState *s = opaque; >> >> + ssize_t ret; >> >> + >> >> + do { >> >> + char *p = (char *)&s->qnio_event_acb; >> >> + >> >> + ret = read(s->fds[VDISK_FD_READ], p + s->event_reader_pos, >> >> + sizeof(s->qnio_event_acb) - s->event_reader_pos); >> >> + if (ret > 0) { >> >> + s->event_reader_pos += ret; >> >> + if (s->event_reader_pos == sizeof(s->qnio_event_acb)) { >> >> + s->event_reader_pos = 0; >> >> + vxhs_complete_aio(s->qnio_event_acb, s); >> >> + } >> >> + } >> >> + } while (ret < 0 && errno == EINTR); >> >> +} >> >> + >> >> +/* >> >> + * QEMU calls this to check if there are any pending IO on vDisk. >> >> + * It will wait in a loop until all the AIOs are completed. >> >> + */ >> >> +int vxhs_aio_flush_cb(void *opaque) >> >> +{ >> >> + BDRVVXHSState *s = opaque; >> >> + >> >> + return vxhs_get_vdisk_iocount(s); >> >> +} >> > >> > Unused function. >> > >> > >> >> Removed. >> >> >> + >> >> +/* >> >> + * This will be called by QEMU while booting for each vDisk. >> >> + * bs->opaque will be allocated by QEMU upper block layer before >> >> + * calling open. It will load all the QNIO operations from >> >> + * qemuqnio library and call QNIO operation to create channel to >> >> + * do IO on vDisk. It parses the URI, gets the hostname, vDisk >> >> + * path and then sets HyperScale event handler to QEMU. >> >> + */ >> >> +void *vxhs_setup_qnio(void) >> >> +{ >> >> + void *qnio_ctx = NULL; >> >> + >> >> + qnio_ctx = qemu_iio_init(vxhs_iio_callback); >> > >> > One concern with this callback function - when does it get called? I am >> > assuming that qemu_iio_init() is running threads / processes in the >> > background, so this callback could happen at anytime. >> > >> > (I'll discuss my specific concern later, in vxhs_failover_ioctl_cb()) >> > >> > >> > >> >> Yes, that is correct. vxhs_setup_qnio() is called from the threads >> running in libqnio. >> >> >> >> > An aside: >> > >> > It is odd that the qnio library is using a qemu_ prefix for functions. >> > Seems like a good way to run into issues later, especially with relatively >> > generic function names such as 'qemu_ck_spin_lock()'. Consider that most >> > developers / maintainers are probably not going to compile in vxhs given >> > the >> > external library dependence without a specific reason, so name collisions >> > could sneak in. >> > >> > 'nm -g libqnioshim.so | grep qemu_' shows a few that give me pause: >> > >> > 0000000000004d07 T qemu_calculate_iovec_size >> > 000000000000663d T qemu_ck_destroy_lock >> > 00000000000065cf T qemu_ck_initialize_lock >> > 00000000000065f7 T qemu_ck_spin_lock >> > 000000000000661a T qemu_ck_spin_unlock >> > 0000000000004dfb T qemu_convert_iovector_to_buffer >> > 0000000000004d63 T qemu_copy_iov_to_buffer >> > 0000000000005f2f T qemu_extract_flush_response >> > 0000000000005c89 T qemu_extract_geometry_from_json >> > 0000000000005a78 T qemu_extract_size_from_json >> > 0000000000004f40 T qemu_is_iovector_read_aligned >> > >> > >> >> Replaced all qemu_ prefixed symbols within qnio. 
>> >> > >> >> + >> >> + if (qnio_ctx != NULL) { >> >> + trace_vxhs_setup_qnio(qnio_ctx); >> >> + } else { >> >> + trace_vxhs_setup_qnio_nwerror('.'); >> >> + } >> >> + >> >> + return qnio_ctx; >> >> +} >> >> + >> >> +static QemuOptsList runtime_opts = { >> >> + .name = "vxhs", >> >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), >> >> + .desc = { >> >> + { >> >> + .name = VXHS_OPT_FILENAME, >> >> + .type = QEMU_OPT_STRING, >> >> + .help = "URI to the Veritas HyperScale image", >> >> + }, >> >> + { >> >> + .name = VXHS_OPT_VDISK_ID, >> >> + .type = QEMU_OPT_STRING, >> >> + .help = "UUID of the VxHS vdisk", >> >> + }, >> >> + { /* end of list */ } >> >> + }, >> >> +}; >> >> + >> >> +static QemuOptsList runtime_tcp_opts = { >> >> + .name = "vxhs_tcp", >> >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_tcp_opts.head), >> >> + .desc = { >> >> + { >> >> + .name = VXHS_OPT_HOST, >> >> + .type = QEMU_OPT_STRING, >> >> + .help = "host address (ipv4 addresses)", >> >> + }, >> >> + { >> >> + .name = VXHS_OPT_PORT, >> >> + .type = QEMU_OPT_NUMBER, >> >> + .help = "port number on which VxHSD is listening (default >> >> 9999)", >> >> + .def_value_str = "9999" >> >> + }, >> >> + { >> >> + .name = "to", >> >> + .type = QEMU_OPT_NUMBER, >> >> + .help = "max port number, not supported by VxHS", >> >> + }, >> >> + { >> >> + .name = "ipv4", >> >> + .type = QEMU_OPT_BOOL, >> >> + .help = "ipv4 bool value, not supported by VxHS", >> >> + }, >> >> + { >> >> + .name = "ipv6", >> >> + .type = QEMU_OPT_BOOL, >> >> + .help = "ipv6 bool value, not supported by VxHS", >> >> + }, >> >> + { /* end of list */ } >> >> + }, >> >> +}; >> >> + >> >> +/* >> >> + * Parse the incoming URI and populate *options with all the host(s) >> >> + * information. Host at index 0 is local storage agent. >> >> + * Remaining are the reflection target storage agents. The local storage >> >> agent >> >> + * ip is the efficient internal address in the uri, e.g. 192.168.0.2. >> >> + * The local storage agent address is stored at index 0. The reflection >> >> target >> >> + * ips, are the E-W data network addresses of the reflection node >> >> agents, also >> >> + * extracted from the uri. 
>> >> + */ >> >> +static int vxhs_parse_uri(const char *filename, QDict *options) >> >> +{ >> >> + gchar **target_list; >> >> + URI *uri = NULL; >> >> + char *hoststr, *portstr; >> >> + char *port; >> >> + int ret = 0; >> >> + int i = 0; >> >> + >> >> + trace_vxhs_parse_uri_filename(filename); >> >> + target_list = g_strsplit(filename, "%7D", 0); >> >> + assert(target_list != NULL && target_list[0] != NULL); >> >> + >> >> + for (i = 0; target_list[i] != NULL && *target_list[i]; i++) { >> >> + uri = uri_parse(target_list[i]); >> >> + if (!uri || !uri->server) { >> >> + uri_free(uri); >> >> + ret = -EINVAL; >> >> + break; >> >> + } >> >> + >> >> + hoststr = g_strdup_printf(VXHS_OPT_SERVER"%d.host", i); >> >> + qdict_put(options, hoststr, qstring_from_str(uri->server)); >> >> + >> >> + portstr = g_strdup_printf(VXHS_OPT_SERVER"%d.port", i); >> >> + if (uri->port) { >> >> + port = g_strdup_printf("%d", uri->port); >> >> + qdict_put(options, portstr, qstring_from_str(port)); >> >> + g_free(port); >> >> + } >> >> + >> >> + if (i == 0 && (strstr(uri->path, "vxhs") == NULL)) { >> >> + qdict_put(options, "vdisk_id", qstring_from_str(uri->path)); >> >> + } >> >> + >> >> + trace_vxhs_parse_uri_hostinfo(i + 1, uri->server, uri->port); >> >> + g_free(hoststr); >> >> + g_free(portstr); >> >> + uri_free(uri); >> >> + } >> >> + >> >> + g_strfreev(target_list); >> >> + return ret; >> >> +} >> >> + >> >> +static void vxhs_parse_filename(const char *filename, QDict *options, >> >> + Error **errp) >> >> +{ >> >> + if (qdict_haskey(options, "host") >> >> + || qdict_haskey(options, "port") >> >> + || qdict_haskey(options, "path")) >> >> + { >> >> + error_setg(errp, "host/port/path and a file name may not be >> >> specified " >> >> + "at the same time"); >> >> + return; >> >> + } >> >> + >> >> + if (strstr(filename, "://")) { >> >> + int ret = vxhs_parse_uri(filename, options); >> >> + if (ret < 0) { >> >> + error_setg(errp, "Invalid URI. URI should be of the form " >> >> + " vxhs://<host_ip>:<port>/{<vdisk_id>}"); >> >> + } >> >> + } >> >> +} >> >> + >> >> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s, >> >> + int *cfd, int *rfd, Error **errp) >> >> +{ >> >> + QDict *backing_options = NULL; >> >> + const char *vxhs_filename; >> >> + char *of_vsa_addr = NULL; >> >> + Error *local_err = NULL; >> >> + const char *ptr = NULL; >> > >> > ptr does not need to be initialized to NULL here. I also recommend using a >> > more descriptive name like 'vdisk_id_opt' over something generic like >> > 'ptr'. >> > >> >> Fixed. >> >> >> >> + char *file_name = NULL; >> >> + size_t num_servers; >> >> + char *str = NULL; >> >> + QemuOpts *opts; >> >> + int ret = 0; >> >> + int i; >> >> + >> >> + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); >> >> + qemu_opts_absorb_qdict(opts, options, &local_err); >> >> + if (local_err) { >> >> + ret = -EINVAL; >> >> + goto out; >> >> + } >> >> + >> >> + vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME); >> >> + if (vxhs_filename) { >> >> + trace_vxhs_qemu_init_filename(vxhs_filename); >> >> + } >> >> + >> >> + ptr = qemu_opt_get(opts, VXHS_OPT_VDISK_ID); >> >> + if (!ptr) { >> >> + error_setg(&local_err, QERR_MISSING_PARAMETER, >> >> VXHS_OPT_VDISK_ID); >> >> + ret = -EINVAL; >> >> + goto out; >> >> + } >> >> + s->vdisk_guid = g_strdup(ptr); >> > >> > This is never freed anywhere. >> > >> >> Now freed on error conditions and in vxhs_close(). 
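For illustration, the extra cleanup in vxhs_close() could look roughly like this (a sketch only, reusing field names from the patch; the hostip strings allocated below are freed the same way, and the exact code in the next revision may differ):

    /* in vxhs_close(), after the existing fd/lock teardown; int i declared locally */
    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;
    for (i = 0; i < s->vdisk_nhosts; i++) {
        g_free(s->vdisk_hostinfo[i].hostip);
        s->vdisk_hostinfo[i].hostip = NULL;
    }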
>> >> >> + trace_vxhs_qemu_init_vdisk(ptr); >> >> + >> >> + num_servers = qdict_array_entries(options, VXHS_OPT_SERVER); >> >> + if (num_servers < 1) { >> >> + error_setg(&local_err, QERR_MISSING_PARAMETER, "server"); >> >> + ret = -EINVAL; >> >> + goto out; >> >> + } >> >> + trace_vxhs_qemu_init_numservers(num_servers); >> >> + >> >> + qemu_opts_del(opts); >> >> + >> >> + for (i = 0; i < num_servers; i++) { >> > >> > s->vdisk_hostinfo[] is an array with MAX_HOSTS (4) as a size. There is no >> > bounds checking on num_servers, which is an issue: we can easily blow >> > past this. >> > >> > In general, for any parameter supplied by a user, or read from a file / >> > protocol / etc, we need to do appropriate bounds checking to prevent >> > potential CVEs. >> > >> >> Fixed. >> >> >> + str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i); >> >> + qdict_extract_subqdict(options, &backing_options, str); >> >> + >> >> + /* create opts info from runtime_tcp_opts list */ >> >> + opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, >> >> &error_abort); >> >> + qemu_opts_absorb_qdict(opts, backing_options, &local_err); >> >> + if (local_err) { >> >> + ret = -EINVAL; >> >> + qemu_opts_del(opts); >> >> + goto out; >> >> + } >> >> + >> >> + s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(opts, >> >> + >> >> VXHS_OPT_HOST)); >> > >> > This is never freed anywhere. >> > >> >> Now freed on error and vxhs_close(). >> >> >> + s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(opts, >> >> + >> >> VXHS_OPT_PORT), >> >> + NULL, 0); >> >> + >> >> + s->vdisk_hostinfo[i].qnio_cfd = -1; >> >> + s->vdisk_hostinfo[i].vdisk_rfd = -1; >> >> + trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip, >> >> + s->vdisk_hostinfo[i].port); >> >> + qemu_opts_del(opts); >> >> + } >> >> + >> >> + s->vdisk_nhosts = i; >> >> + s->vdisk_cur_host_idx = 0; >> >> + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); >> >> + of_vsa_addr = g_strdup_printf("of://%s:%d", >> >> + >> >> s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip, >> >> + >> >> s->vdisk_hostinfo[s->vdisk_cur_host_idx].port); >> >> + >> >> + /* >> >> + * .bdrv_open() and .bdrv_create() run under the QEMU global mutex. 
>> >> + */ >> >> + if (global_qnio_ctx == NULL) { >> >> + global_qnio_ctx = vxhs_setup_qnio(); >> >> + if (global_qnio_ctx == NULL) { >> >> + error_setg(&local_err, "Failed vxhs_setup_qnio"); >> >> + ret = -EINVAL; >> >> + goto out; >> >> + } >> >> + } >> >> + >> >> + *cfd = qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0); >> >> + if (*cfd < 0) { >> >> + error_setg(&local_err, "Failed qemu_open_iio_conn"); >> >> + ret = -EIO; >> >> + goto out; >> >> + } >> >> + *rfd = qemu_iio_devopen(global_qnio_ctx, *cfd, file_name, 0); >> >> + if (*rfd < 0) { >> >> + error_setg(&local_err, "Failed qemu_iio_devopen"); >> >> + ret = -EIO; >> >> + goto out; >> >> + } >> >> + >> >> +out: >> >> + g_free(file_name); >> >> + g_free(of_vsa_addr); >> >> + if (str) { >> >> + qdict_del(backing_options, str); >> >> + g_free(str); >> >> + } >> >> + >> >> + if (ret < 0) { >> >> + errno = -ret; >> >> + } >> >> + error_propagate(errp, local_err); >> >> + return ret; >> >> +} >> >> + >> >> +int vxhs_open(BlockDriverState *bs, QDict *options, >> >> + int bdrv_flags, Error **errp) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + AioContext *aio_context; >> >> + int qemu_qnio_cfd = -1; >> >> + int qemu_rfd = -1; >> >> + int ret = 0; >> >> + >> >> + ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp); >> >> + if (ret < 0) { >> >> + trace_vxhs_open_fail(ret); >> >> + return ret; >> >> + } >> >> + >> >> + s->qnio_ctx = global_qnio_ctx; >> >> + s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd; >> >> + s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd; >> >> + s->vdisk_size = 0; >> >> + QSIMPLEQ_INIT(&s->vdisk_aio_retryq); >> >> + >> >> + /* >> >> + * Create a pipe for communicating between two threads in different >> >> + * context. Set handler for read event, which gets triggered when >> >> + * IO completion is done by non-QEMU context. >> >> + */ >> >> + ret = qemu_pipe(s->fds); >> >> + if (ret < 0) { >> >> + trace_vxhs_open_epipe('.'); >> >> + ret = -errno; >> >> + goto out; >> >> + } >> >> + fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK); >> >> + >> >> + aio_context = bdrv_get_aio_context(bs); >> >> + aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ], >> >> + false, vxhs_aio_event_reader, NULL, s); >> >> + >> >> + /* >> >> + * Allocate/Initialize the spin-locks. >> >> + * >> >> + * NOTE: >> >> + * Since spin lock is being allocated >> >> + * dynamically hence moving acb struct >> >> + * specific lock to BDRVVXHSState >> >> + * struct. The reason being, >> >> + * we don't want the overhead of spin >> >> + * lock being dynamically allocated and >> >> + * freed for every AIO. >> >> + */ >> >> + s->vdisk_lock = VXHS_SPIN_LOCK_ALLOC; >> >> + s->vdisk_acb_lock = VXHS_SPIN_LOCK_ALLOC; >> >> + >> >> + return 0; >> >> + >> >> +out: >> >> + if (s->vdisk_hostinfo[0].vdisk_rfd >= 0) { >> >> + qemu_iio_devclose(s->qnio_ctx, 0, >> >> + s->vdisk_hostinfo[0].vdisk_rfd); >> >> + } >> >> + /* never close qnio_cfd */ >> > >> > Surely we want to be able to close it at some point... or does the qnio >> > library take care of that cleanup somewhere/sometime? >> > >> > >> >> Changed to close channel fd on error conditions, vxhs_close() and >> vxhs_reopen_vdisk(). >> >> >> >> + trace_vxhs_open_fail(ret); >> >> + return ret; >> >> +} >> >> + >> >> +static const AIOCBInfo vxhs_aiocb_info = { >> >> + .aiocb_size = sizeof(VXHSAIOCB) >> >> +}; >> >> + >> >> +/* >> >> + * This is called in QNIO thread context when IO done >> >> + * on IO Manager and QNIO client received the data or >> >> + * ACK. 
It notify another event handler thread running in QEMU context >> >> + * by writing on the pipe >> >> + */ >> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg) >> >> +{ >> >> + VXHSAIOCB *acb = arg; >> >> + BlockDriverState *bs = acb->common.bs; >> >> + BDRVVXHSState *s = bs->opaque; >> >> + int rv; >> >> + >> >> + acb->ret = ret; >> >> + rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); >> >> + if (rv != sizeof(acb)) { >> >> + error_report("VXHS AIO completion failed: %s", >> >> + strerror(errno)); >> >> + abort(); >> >> + } >> >> +} >> > >> > Another unused function. >> > >> >> Removed. >> >> >> >> + >> >> +/* >> >> + * This allocates QEMU-VXHS callback for each IO >> >> + * and is passed to QNIO. When QNIO completes the work, >> >> + * it will be passed back through the callback. >> >> + */ >> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, >> >> + int64_t sector_num, QEMUIOVector *qiov, >> >> + int nb_sectors, >> >> + BlockCompletionFunc *cb, >> >> + void *opaque, int iodir) >> >> +{ >> >> + VXHSAIOCB *acb = NULL; >> >> + BDRVVXHSState *s = bs->opaque; >> >> + size_t size; >> >> + uint64_t offset; >> >> + int iio_flags = 0; >> >> + int ret = 0; >> >> + >> >> + offset = sector_num * BDRV_SECTOR_SIZE; >> >> + size = nb_sectors * BDRV_SECTOR_SIZE; >> >> + >> >> + acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque); >> >> + /* >> >> + * Setup or initialize VXHSAIOCB. >> >> + * Every single field should be initialized since >> >> + * acb will be picked up from the slab without >> >> + * initializing with zero. >> >> + */ >> >> + acb->io_offset = offset; >> >> + acb->size = size; >> >> + acb->ret = 0; >> >> + acb->flags = 0; >> >> + acb->aio_done = VXHS_IO_INPROGRESS; >> >> + acb->segments = 0; >> >> + acb->buffer = 0; >> >> + acb->qiov = qiov; >> >> + acb->direction = iodir; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + if (OF_VDISK_FAILED(s)) { >> >> + trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto errout; >> >> + } >> >> + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { >> >> + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); >> >> + s->vdisk_aio_retry_qd++; >> >> + OF_AIOCB_FLAGS_SET_QUEUED(acb); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1); >> >> + goto out; >> >> + } >> >> + s->vdisk_aio_count++; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + >> >> + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); >> >> + >> >> + switch (iodir) { >> >> + case VDISK_AIO_WRITE: >> >> + vxhs_inc_acb_segment_count(acb, 1); >> >> + ret = qemu_iio_writev(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + qiov->iov, qiov->niov, offset, (void *)acb, >> >> iio_flags); >> >> + break; >> >> + case VDISK_AIO_READ: >> >> + vxhs_inc_acb_segment_count(acb, 1); >> >> + ret = qemu_iio_readv(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + qiov->iov, qiov->niov, offset, (void *)acb, >> >> iio_flags); >> >> + break; >> >> + default: >> >> + trace_vxhs_aio_rw_invalid(iodir); >> >> + goto errout; >> >> + } >> >> + >> >> + if (ret != 0) { >> >> + trace_vxhs_aio_rw_ioerr( >> >> + s->vdisk_guid, iodir, size, offset, >> >> + acb, acb->segments, ret, errno); >> >> + /* >> >> + * Don't retry I/Os against vDisk having no >> >> + * redundancy or stateful storage on compute >> >> + * >> >> + * TODO: Revisit this code path to see if any >> >> + * particular error needs to be handled. >> >> + * At this moment failing the I/O. 
>> >> + */ >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + if (s->vdisk_nhosts == 1) { >> >> + trace_vxhs_aio_rw_iofail(s->vdisk_guid); >> >> + s->vdisk_aio_count--; >> >> + vxhs_dec_acb_segment_count(acb, 1); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto errout; >> >> + } >> >> + if (OF_VDISK_FAILED(s)) { >> >> + trace_vxhs_aio_rw_devfail( >> >> + s->vdisk_guid, iodir, size, offset); >> >> + s->vdisk_aio_count--; >> >> + vxhs_dec_acb_segment_count(acb, 1); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto errout; >> >> + } >> >> + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { >> >> + /* >> >> + * Queue all incoming io requests after failover starts. >> >> + * Number of requests that can arrive is limited by io queue >> >> depth >> >> + * so an app blasting independent ios will not exhaust >> >> memory. >> >> + */ >> >> + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); >> >> + s->vdisk_aio_retry_qd++; >> >> + OF_AIOCB_FLAGS_SET_QUEUED(acb); >> >> + s->vdisk_aio_count--; >> >> + vxhs_dec_acb_segment_count(acb, 1); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 2); >> >> + goto out; >> >> + } >> >> + OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s); >> >> + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry); >> >> + s->vdisk_aio_retry_qd++; >> >> + OF_AIOCB_FLAGS_SET_QUEUED(acb); >> >> + vxhs_dec_acb_segment_count(acb, 1); >> >> + trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 3); >> >> + /* >> >> + * Start I/O failover if there is no active >> >> + * AIO within vxhs block driver. >> >> + */ >> >> + if (--s->vdisk_aio_count == 0) { >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + /* >> >> + * Start IO failover >> >> + */ >> >> + vxhs_failover_io(s); >> >> + goto out; >> >> + } >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + } >> >> + >> >> +out: >> >> + return &acb->common; >> >> + >> >> +errout: >> >> + qemu_aio_unref(acb); >> >> + return NULL; >> >> +} >> >> + >> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, >> >> + int64_t sector_num, QEMUIOVector >> >> *qiov, >> >> + int nb_sectors, >> >> + BlockCompletionFunc *cb, void *opaque) >> >> +{ >> >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, >> >> + cb, opaque, VDISK_AIO_READ); >> >> +} >> >> + >> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, >> >> + int64_t sector_num, QEMUIOVector >> >> *qiov, >> >> + int nb_sectors, >> >> + BlockCompletionFunc *cb, void >> >> *opaque) >> >> +{ >> >> + return vxhs_aio_rw(bs, sector_num, qiov, nb_sectors, >> >> + cb, opaque, VDISK_AIO_WRITE); >> >> +} >> >> + >> >> +/* >> >> + * This is called by QEMU when a flush gets triggered from within >> >> + * a guest at the block layer, either for IDE or SCSI disks. >> >> + */ >> >> +int vxhs_co_flush(BlockDriverState *bs) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + uint64_t size = 0; >> >> + int ret = 0; >> >> + >> >> + ret = qemu_iio_ioctl(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + VDISK_AIO_FLUSH, &size, NULL, IIO_FLAG_SYNC); >> >> + >> >> + if (ret < 0) { >> >> + /* >> >> + * Currently not handling the flush ioctl >> >> + * failure because of network connection >> >> + * disconnect. Since all the writes are >> >> + * commited into persistent storage hence >> >> + * this flush call is noop and we can safely >> >> + * return success status to the caller. >> > >> > I'm not sure I understand here. Are you saying the qemu_iio_ioctl() call >> > above is a noop? 
>> > >> >> Yes, qemu_iio_ioctl(VDISK_AIO_FLUSH) is only a place-holder at present >> in case we later want to add some functionality to it. I have now >> added a comment to this affect to avoid any confusion. >> > > The problem is you don't know which version of the qnio library any given > QEMU binary will be using, since it is a shared library. Future versions > may implement the flush ioctl as expressed above, in which case we may hide > a valid error. > > Am I correct in assuming that this call suppresses errors because an error > is returned for an unknown ioctl operation of VDISK_AIO_FLUSH? If so, and > you want a placeholder here for flushing, you should go all the way and stub > out the underlying ioctl call to return success. Then QEMU can at least > rely on the error return from the flush operation. > > >> >> + * >> >> + * If any write failure occurs for inflight >> >> + * write AIO because of network disconnect >> >> + * then anyway IO failover will be triggered. >> >> + */ >> >> + trace_vxhs_co_flush(s->vdisk_guid, ret, errno); >> >> + ret = 0; >> >> + } >> >> + >> >> + return ret; >> >> +} >> >> + >> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s) >> >> +{ >> >> + void *ctx = NULL; >> >> + int flags = 0; >> >> + unsigned long vdisk_size = 0; >> >> + int ret = 0; >> >> + >> >> + ret = qemu_iio_ioctl(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + VDISK_STAT, &vdisk_size, ctx, flags); >> >> + >> >> + if (ret < 0) { >> >> + trace_vxhs_get_vdisk_stat_err(s->vdisk_guid, ret, errno); >> >> + return 0; >> >> + } >> >> + >> >> + trace_vxhs_get_vdisk_stat(s->vdisk_guid, vdisk_size); >> >> + return vdisk_size; >> >> +} >> >> + >> >> +/* >> >> + * Returns the size of vDisk in bytes. This is required >> >> + * by QEMU block upper block layer so that it is visible >> >> + * to guest. >> >> + */ >> >> +int64_t vxhs_getlength(BlockDriverState *bs) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + unsigned long vdisk_size = 0; >> >> + >> >> + if (s->vdisk_size > 0) { >> >> + vdisk_size = s->vdisk_size; >> > >> > s->vdisk_size is int64_t, vdisk_size is unsigned long. On 32-bit >> > platforms, >> > I believe unsigned long is only 32-bits, so this assignment is problematic. >> > >> > What is the maximum disk size for vxhs? >> > >> > (this comment applies to all the vdisk_size usage, but I'll only mention it >> > here) >> > >> >> Changed to int64_t in all places. The code does not place any >> restriction on the maximum size of the vdisk. >> >> >> + } else { >> >> + /* >> >> + * Fetch the vDisk size using stat ioctl >> >> + */ >> >> + vdisk_size = vxhs_get_vdisk_stat(s); >> >> + if (vdisk_size > 0) { >> >> + s->vdisk_size = vdisk_size; >> >> + } >> >> + } >> >> + >> >> + if (vdisk_size > 0) { >> >> + return (int64_t)vdisk_size; /* return size in bytes */ >> >> + } else { >> >> + return -EIO; >> >> + } >> >> +} >> >> + >> >> +/* >> >> + * Returns actual blocks allocated for the vDisk. >> >> + * This is required by qemu-img utility. >> >> + */ >> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + unsigned long vdisk_size = 0; >> >> + >> >> + if (s->vdisk_size > 0) { >> >> + vdisk_size = s->vdisk_size; >> >> + } else { >> >> + /* >> >> + * TODO: >> >> + * Once HyperScale storage-virtualizer provides >> >> + * actual physical allocation of blocks then >> >> + * fetch that information and return back to the >> >> + * caller but for now just get the full size. 
>> >> + */ >> >> + vdisk_size = vxhs_get_vdisk_stat(s); >> >> + if (vdisk_size > 0) { >> >> + s->vdisk_size = vdisk_size; >> >> + } >> >> + } >> >> + >> >> + if (vdisk_size > 0) { >> >> + return (int64_t)vdisk_size; /* return size in bytes */ >> >> + } else { >> >> + return -EIO; >> >> + } >> >> +} >> >> + >> >> +void vxhs_close(BlockDriverState *bs) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + >> >> + close(s->fds[VDISK_FD_READ]); >> >> + close(s->fds[VDISK_FD_WRITE]); >> >> + >> >> + /* >> >> + * never close channel - not ref counted, will >> >> + * close for all vdisks >> >> + */ >> >> + if (s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd >= 0) { >> >> + qemu_iio_devclose(s->qnio_ctx, 0, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd); >> >> + } >> >> + if (s->vdisk_lock) { >> >> + VXHS_SPIN_LOCK_DESTROY(s->vdisk_lock); >> >> + s->vdisk_lock = NULL; >> >> + } >> >> + if (s->vdisk_acb_lock) { >> >> + VXHS_SPIN_LOCK_DESTROY(s->vdisk_acb_lock); >> >> + s->vdisk_acb_lock = NULL; >> >> + } >> >> + >> >> + /* >> >> + * TODO: Verify that all the resources were relinguished. >> >> + */ >> > >> > Which resources? Allocated here in the driver, or stuff in libqnio? >> > >> >> The comment was for all resources allocated within vxhs.c. >> Added code to free all allocated memory and fds on close. >> >> >> >> +} >> >> + >> >> +/* >> >> + * If errors are consistent with storage agent failure: >> >> + * - Try to reconnect in case error is transient or storage agent >> >> restarted. >> >> + * - Currently failover is being triggered on per vDisk basis. There is >> >> + * a scope of further optimization where failover can be global (per >> >> VM). >> >> + * - In case of network (storage agent) failure, for all the vDisks, >> >> having >> >> + * no redundancy, I/Os will be failed without attempting for I/O >> >> failover >> >> + * because of stateless nature of vDisk. >> >> + * - If local or source storage agent is down then send an ioctl to >> >> remote >> >> + * storage agent to check if remote storage agent in a state to accept >> >> + * application I/Os. >> >> + * - Once remote storage agent is ready to accept I/O, start I/O >> >> shipping. >> >> + * - If I/Os cannot be serviced then vDisk will be marked failed so that >> >> + * new incoming I/Os are returned with failure immediately. >> >> + * - If vDisk I/O failover is in progress then all new/inflight I/Os >> >> will >> >> + * queued and will be restarted or failed based on failover operation >> >> + * is successful or not. >> >> + * - I/O failover can be started either in I/O forward or I/O backward >> >> + * path. >> >> + * - I/O failover will be started as soon as all the pending acb(s) >> >> + * are queued and there is no pending I/O count. >> >> + * - If I/O failover couldn't be completed within >> >> QNIO_CONNECT_TIMOUT_SECS >> >> + * then vDisk will be marked failed and all I/Os will be completed >> >> with >> >> + * error. 
>> >> + */ >> >> + >> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s) >> >> +{ >> >> + int res = 0; >> >> + int flags = (IIO_FLAG_ASYNC | IIO_FLAG_DONE); >> >> + >> >> + trace_vxhs_switch_storage_agent( >> >> + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip, >> >> + s->vdisk_guid); >> >> + >> >> + res = vxhs_reopen_vdisk(s, s->vdisk_ask_failover_idx); >> >> + if (res == 0) { >> >> + res = qemu_iio_ioctl(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].vdisk_rfd, >> >> + VDISK_CHECK_IO_FAILOVER_READY, NULL, s, flags); >> >> + } else { >> >> + trace_vxhs_switch_storage_agent_failed( >> >> + s->vdisk_hostinfo[s->vdisk_ask_failover_idx].hostip, >> >> + s->vdisk_guid, res, errno); >> >> + /* >> >> + * TODO: calling vxhs_failover_ioctl_cb from here ties up the >> >> qnio epoll >> >> + * loop if qemu_iio_ioctl fails synchronously (-1) for all hosts >> >> in io >> >> + * target list. >> >> + */ >> >> + >> >> + /* try next host */ >> >> + vxhs_failover_ioctl_cb(res, s); >> >> + } >> >> + return res; >> >> +} >> >> + >> >> +void vxhs_failover_ioctl_cb(int res, void *ctx) >> >> +{ >> >> + BDRVVXHSState *s = ctx; >> >> + >> >> + if (res == 0) { >> >> + /* found failover target */ >> >> + s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx; >> >> + s->vdisk_ask_failover_idx = 0; >> >> + trace_vxhs_failover_ioctl_cb( >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip, >> >> + s->vdisk_guid); >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + vxhs_handle_queued_ios(s); >> >> + } else { >> >> + /* keep looking */ >> >> + trace_vxhs_failover_ioctl_cb_retry(s->vdisk_guid); >> >> + s->vdisk_ask_failover_idx++; >> >> + if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) { >> > >> > This function, vxhs_failover_ioctrl_cb(), is called from the external >> > library callback, in addition to QEMU itself (i.e., vxhs_aio_rw). This >> > makes me concerned that we may have (or be able to induce) an unsafe race >> > condition when incrementing the vdisk_ask_failover_id array index. >> > >> >> vxhs_failover_ioctl_cb() can be called from the callback >> vxhs_iio_callback() case IRP_VDISK_CHECK_IO_FAILOVER_READY in response >> to the qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY) >> >> This ioctl qnio_iio_ioctl(VDISK_CHECK_IO_FAILOVER_READY) is only >> called from within vxhs_switch_storage_agent() >> >> A direct call to vxhs_failover_ioctl_cb() is only issued from the same >> function vxhs_switch_storage_agent() on an "else" condition. So >> vxhs_failover_ioctl_cb() can only be called from a single thread >> context either in response to the ioctl, or directly. We should be >> safe here, but thanks for pointing it out! > > > The callback function 'vxhs_iio_callback' also calls > vxhs_failover_ioctl_cb(). > > vxhs_failover_ioctl_cb() is also called from vxhs_aio_rw() by QEMU (via > 'vxds_failover_io'). > > The above two paths make me concerned about the array pointer being > incremented. > > (BTW: 'vxhs_failover_ioctl_cb' seems to be misnamed, because it is not a > callback function, but just a helper function). > > Also related: this code reads like it will hang QEMU if QNIO goes offline; > we will be caught in perpetual retries, in which case vxhs_aio_rw() may > never return. 
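For reference, one way to avoid retrying forever would be to honour the QNIO_CONNECT_TIMOUT_SECS bound already described in the failover comment above: once the whole host list has been cycled past that deadline, mark the vDisk failed so queued acbs are completed with an error instead of being retried again. A rough sketch of the retry branch in vxhs_failover_ioctl_cb(), where vxhs_failover_deadline_exceeded() is a hypothetical helper comparing against a timestamp taken when failover started (not the actual implementation):

    s->vdisk_ask_failover_idx++;
    if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
        if (vxhs_failover_deadline_exceeded(s, QNIO_CONNECT_TIMOUT_SECS)) {
            /* give up: fail the vDisk so queued I/Os complete with -EIO */
            VXHS_SPIN_LOCK(s->vdisk_lock);
            OF_VDISK_SET_FAILED(s);
            OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
            VXHS_SPIN_UNLOCK(s->vdisk_lock);
            vxhs_handle_queued_ios(s);
            return;
        }
        /* otherwise pause and cycle through the host list again, as before */
        sleep(QNIO_CONNECT_RETRY_SECS);
        s->vdisk_ask_failover_idx = 0;
    }
    res = vxhs_switch_storage_agent(s);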
> >> >> >> + /* pause and cycle through list again */ >> >> + sleep(QNIO_CONNECT_RETRY_SECS); >> >> + s->vdisk_ask_failover_idx = 0; >> >> + } >> >> + res = vxhs_switch_storage_agent(s); >> >> + } >> >> +} >> >> + >> >> +int vxhs_failover_io(BDRVVXHSState *s) >> >> +{ >> >> + int res = 0; >> >> + >> >> + trace_vxhs_failover_io(s->vdisk_guid); >> >> + >> >> + s->vdisk_ask_failover_idx = 0; >> >> + res = vxhs_switch_storage_agent(s); >> >> + >> >> + return res; >> >> +} >> >> + >> >> +/* >> >> + * Try to reopen the vDisk on one of the available hosts >> >> + * If vDisk reopen is successful on any of the host then >> >> + * check if that node is ready to accept I/O. >> >> + */ >> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int index) >> >> +{ >> >> + char *of_vsa_addr = NULL; >> >> + char *file_name = NULL; >> >> + int res = 0; >> >> + >> >> + /* >> >> + * Don't close the channel if it was opened >> >> + * before successfully. It will be handled >> >> + * within iio* api if the same channel open >> >> + * fd is reused. >> >> + * >> >> + * close stale vdisk device remote fd since >> >> + * it is invalid after channel disconnect. >> >> + */ >> >> + if (s->vdisk_hostinfo[index].vdisk_rfd >= 0) { >> >> + qemu_iio_devclose(s->qnio_ctx, 0, >> >> + s->vdisk_hostinfo[index].vdisk_rfd); >> >> + s->vdisk_hostinfo[index].vdisk_rfd = -1; >> >> + } >> >> + >> >> + /* >> >> + * build storage agent address and vdisk device name strings >> >> + */ >> >> + file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid); >> >> + of_vsa_addr = g_strdup_printf("of://%s:%d", >> >> + s->vdisk_hostinfo[index].hostip, >> >> s->vdisk_hostinfo[index].port); >> >> + /* >> >> + * open qnio channel to storage agent if not opened before. >> >> + */ >> >> + if (s->vdisk_hostinfo[index].qnio_cfd < 0) { >> >> + s->vdisk_hostinfo[index].qnio_cfd = >> >> + qemu_open_iio_conn(global_qnio_ctx, of_vsa_addr, 0); >> >> + if (s->vdisk_hostinfo[index].qnio_cfd < 0) { >> >> + trace_vxhs_reopen_vdisk(s->vdisk_hostinfo[index].hostip); >> >> + res = ENODEV; >> >> + goto out; >> >> + } >> >> + } >> >> + /* >> >> + * open vdisk device >> >> + */ >> >> + s->vdisk_hostinfo[index].vdisk_rfd = >> >> + qemu_iio_devopen(global_qnio_ctx, >> >> + s->vdisk_hostinfo[index].qnio_cfd, file_name, 0); >> >> + if (s->vdisk_hostinfo[index].vdisk_rfd < 0) { >> >> + trace_vxhs_reopen_vdisk_openfail(file_name); >> >> + res = EIO; >> >> + goto out; >> >> + } >> >> + >> >> +out: >> >> + g_free(of_vsa_addr); >> >> + g_free(file_name); >> >> + return res; >> >> +} >> >> + >> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s) >> >> +{ >> >> + VXHSAIOCB *acb = NULL; >> >> + int res = 0; >> >> + >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + while ((acb = QSIMPLEQ_FIRST(&s->vdisk_aio_retryq)) != NULL) { >> >> + /* >> >> + * Before we process the acb, check whether I/O failover >> >> + * started again due to failback or cascading failure. 
>> >> + */ >> >> + if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) { >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto out; >> >> + } >> >> + QSIMPLEQ_REMOVE_HEAD(&s->vdisk_aio_retryq, retry_entry); >> >> + s->vdisk_aio_retry_qd--; >> >> + OF_AIOCB_FLAGS_RESET_QUEUED(acb); >> >> + if (OF_VDISK_FAILED(s)) { >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + vxhs_fail_aio(acb, EIO); >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + } else { >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + res = vxhs_restart_aio(acb); >> >> + trace_vxhs_handle_queued_ios(acb, res); >> >> + VXHS_SPIN_LOCK(s->vdisk_lock); >> >> + if (res) { >> >> + QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, >> >> + acb, retry_entry); >> >> + OF_AIOCB_FLAGS_SET_QUEUED(acb); >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> + goto out; >> >> + } >> >> + } >> >> + } >> >> + VXHS_SPIN_UNLOCK(s->vdisk_lock); >> >> +out: >> >> + return res; >> >> +} >> >> + >> >> +int vxhs_restart_aio(VXHSAIOCB *acb) >> >> +{ >> >> + BDRVVXHSState *s = NULL; >> >> + int iio_flags = 0; >> >> + int res = 0; >> >> + >> >> + s = acb->common.bs->opaque; >> >> + >> >> + if (acb->direction == VDISK_AIO_WRITE) { >> >> + vxhs_inc_vdisk_iocount(s, 1); >> >> + vxhs_inc_acb_segment_count(acb, 1); >> >> + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); >> >> + res = qemu_iio_writev(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + acb->qiov->iov, acb->qiov->niov, >> >> + acb->io_offset, (void *)acb, iio_flags); >> >> + } >> >> + >> >> + if (acb->direction == VDISK_AIO_READ) { >> >> + vxhs_inc_vdisk_iocount(s, 1); >> >> + vxhs_inc_acb_segment_count(acb, 1); >> >> + iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC); >> >> + res = qemu_iio_readv(s->qnio_ctx, >> >> + s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd, >> >> + acb->qiov->iov, acb->qiov->niov, >> >> + acb->io_offset, (void *)acb, iio_flags); >> >> + } >> >> + >> >> + if (res != 0) { >> >> + vxhs_dec_vdisk_iocount(s, 1); >> >> + vxhs_dec_acb_segment_count(acb, 1); >> >> + trace_vxhs_restart_aio(acb->direction, res, errno); >> >> + } >> >> + >> >> + return res; >> >> +} >> >> + >> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err) >> >> +{ >> >> + BDRVVXHSState *s = NULL; >> >> + int segcount = 0; >> >> + int rv = 0; >> >> + >> >> + s = acb->common.bs->opaque; >> >> + >> >> + trace_vxhs_fail_aio(s->vdisk_guid, acb); >> >> + if (!acb->ret) { >> >> + acb->ret = err; >> >> + } >> >> + VXHS_SPIN_LOCK(s->vdisk_acb_lock); >> >> + segcount = acb->segments; >> >> + VXHS_SPIN_UNLOCK(s->vdisk_acb_lock); >> >> + if (segcount == 0) { >> >> + /* >> >> + * Complete the io request >> >> + */ >> >> + rv = qemu_write_full(s->fds[VDISK_FD_WRITE], &acb, sizeof(acb)); >> >> + if (rv != sizeof(acb)) { >> >> + error_report("VXHS AIO completion failed: %s", >> >> + strerror(errno)); >> >> + abort(); >> >> + } >> >> + } >> >> +} >> >> + >> >> +static void vxhs_detach_aio_context(BlockDriverState *bs) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + >> >> + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ], >> >> + false, NULL, NULL, NULL); >> >> + >> >> +} >> >> + >> >> +static void vxhs_attach_aio_context(BlockDriverState *bs, >> >> + AioContext *new_context) >> >> +{ >> >> + BDRVVXHSState *s = bs->opaque; >> >> + >> >> + aio_set_fd_handler(new_context, s->fds[VDISK_FD_READ], >> >> + false, vxhs_aio_event_reader, NULL, s); >> >> +} >> >> + >> >> +static BlockDriver bdrv_vxhs = { >> >> + .format_name = "vxhs", >> >> + .protocol_name = "vxhs", >> >> + .instance_size = sizeof(BDRVVXHSState), >> >> 
>> >> +    .bdrv_file_open               = vxhs_open,
>> >> +    .bdrv_parse_filename          = vxhs_parse_filename,
>> >> +    .bdrv_close                   = vxhs_close,
>> >> +    .bdrv_getlength               = vxhs_getlength,
>> >> +    .bdrv_get_allocated_file_size = vxhs_get_allocated_blocks,
>> >> +    .bdrv_aio_readv               = vxhs_aio_readv,
>> >> +    .bdrv_aio_writev              = vxhs_aio_writev,
>> >> +    .bdrv_co_flush_to_disk        = vxhs_co_flush,
>> >> +    .bdrv_detach_aio_context      = vxhs_detach_aio_context,
>> >> +    .bdrv_attach_aio_context      = vxhs_attach_aio_context,
>> >> +};
>> >> +
>> >> +void bdrv_vxhs_init(void)
>> >> +{
>> >> +    trace_vxhs_bdrv_init('.');
>> >> +    bdrv_register(&bdrv_vxhs);
>> >> +}
>> >> +
>> >> +block_init(bdrv_vxhs_init);
>> >> diff --git a/block/vxhs.h b/block/vxhs.h
>> >> new file mode 100644
>> >> index 0000000..38e0165
>> >> --- /dev/null
>> >> +++ b/block/vxhs.h
>> >> @@ -0,0 +1,237 @@
>> >> +/*
>> >> + * QEMU Block driver for Veritas HyperScale (VxHS)
>> >> + *
>> >> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
>> >> + * See the COPYING file in the top-level directory.
>> >> + *
>> >> + */
>> >> +
>> >> +#ifndef VXHSD_H
>> >> +#define VXHSD_H
>> >> +
>> >> +#include <gmodule.h>
>> >> +#include <inttypes.h>
>> >> +#include <pthread.h>
>> >> +
>> >> +#include "qemu/osdep.h"
>> >> +#include "qapi/error.h"
>> >> +#include "qemu/error-report.h"
>> >> +#include "block/block_int.h"
>> >> +#include "qemu/uri.h"
>> >> +#include "qemu/queue.h"
>> >> +
>> >> +#define OF_GUID_STR_LEN             40
>> >> +#define OF_GUID_STR_SZ              (OF_GUID_STR_LEN + 1)
>> >> +#define QNIO_CONNECT_RETRY_SECS     5
>> >> +#define QNIO_CONNECT_TIMOUT_SECS    120
>> >> +
>> >> +/* constants from io_qnio.h */
>> >> +#define IIO_REASON_DONE             0x00000004
>> >> +#define IIO_REASON_EVENT            0x00000008
>> >> +#define IIO_REASON_HUP              0x00000010
>> >> +
>> >> +/*
>> >> + * IO specific flags
>> >> + */
>> >> +#define IIO_FLAG_ASYNC              0x00000001
>> >> +#define IIO_FLAG_DONE               0x00000010
>> >> +#define IIO_FLAG_SYNC               0
>> >> +
>> >> +/* constants from error.h */
>> >> +#define VXERROR_RETRY_ON_SOURCE     44
>> >> +#define VXERROR_HUP                 901
>> >> +#define VXERROR_CHANNEL_HUP         903
>> >> +
>> >> +/* constants from iomgr.h and opcode.h */
>> >> +#define IRP_READ_REQUEST            0x1FFF
>> >> +#define IRP_WRITE_REQUEST           0x2FFF
>> >> +#define IRP_VDISK_CHECK_IO_FAILOVER_READY   2020
>> >> +
>> >> +/* Lock specific macros */
>> >> +#define VXHS_SPIN_LOCK_ALLOC        \
>> >> +    (qemu_ck_initialize_lock())
>> >> +#define VXHS_SPIN_LOCK(lock)        \
>> >> +    (qemu_ck_spin_lock(lock))
>> >> +#define VXHS_SPIN_UNLOCK(lock)      \
>> >> +    (qemu_ck_spin_unlock(lock))
>> >> +#define VXHS_SPIN_LOCK_DESTROY(lock)    \
>> >> +    (qemu_ck_destroy_lock(lock))
>> >
>> > What's the value of these macros, rather than just calling the underlying
>> > function directly in code?
>> >
>>
>> This way we can easily swap out CK locks with something else like
>> pthread_mutex if required.
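To make that concrete, here is a minimal sketch (not part of the patch; the vxhs_lock_alloc()/vxhs_lock_destroy() helpers are made up for illustration) showing that only the four macro definitions would need to change to move from the CK spinlocks to pthread mutexes, while every call site stays as-is:

/* Hypothetical pthread-based backing for the VXHS_SPIN_* macros. */
#include <pthread.h>
#include <stdlib.h>

static inline void *vxhs_lock_alloc(void)
{
    pthread_mutex_t *m = malloc(sizeof(*m));  /* one heap-allocated mutex per lock */
    if (m) {
        pthread_mutex_init(m, NULL);
    }
    return m;
}

static inline void vxhs_lock_destroy(void *lock)
{
    pthread_mutex_destroy(lock);
    free(lock);
}

/* Only these four lines differ from the CK-based versions above. */
#define VXHS_SPIN_LOCK_ALLOC            (vxhs_lock_alloc())
#define VXHS_SPIN_LOCK(lock)            (pthread_mutex_lock((pthread_mutex_t *)(lock)))
#define VXHS_SPIN_UNLOCK(lock)          (pthread_mutex_unlock((pthread_mutex_t *)(lock)))
#define VXHS_SPIN_LOCK_DESTROY(lock)    (vxhs_lock_destroy(lock))

int main(void)
{
    /* Usage is identical to the driver's current call sites. */
    void *lock = VXHS_SPIN_LOCK_ALLOC;
    if (!lock) {
        return 1;
    }
    VXHS_SPIN_LOCK(lock);
    VXHS_SPIN_UNLOCK(lock);
    VXHS_SPIN_LOCK_DESTROY(lock);
    return 0;
}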
>>
>> >> +
>> >> +typedef enum {
>> >> +    VXHS_IO_INPROGRESS,
>> >> +    VXHS_IO_COMPLETED,
>> >> +    VXHS_IO_ERROR
>> >> +} VXHSIOState;
>> >> +
>> >> +
>> >> +typedef void (*qnio_callback_t)(ssize_t retval, void *arg);
>> >> +
>> >> +#define VDISK_FD_READ               0
>> >> +#define VDISK_FD_WRITE              1
>> >> +
>> >> +#define QNIO_VDISK_NONE             0x00
>> >> +#define QNIO_VDISK_CREATE           0x01
>> >> +
>> >> +/* max IO size supported by QEMU NIO lib */
>> >> +#define QNIO_MAX_IO_SIZE            4194304
>> >> +
>> >> +#define MAX_HOSTS                   4
>> >> +
>> >> +/*
>> >> + * Opcodes for making IOCTL on QEMU NIO library
>> >> + */
>> >> +#define BASE_OPCODE_SHARED          1000
>> >> +#define BASE_OPCODE_DAL             2000
>> >> +#define IRP_VDISK_STAT              (BASE_OPCODE_SHARED + 5)
>> >> +#define IRP_VDISK_GET_GEOMETRY      (BASE_OPCODE_DAL + 17)
>> >> +#define IRP_VDISK_READ_PARTITION    (BASE_OPCODE_DAL + 18)
>> >> +#define IRP_VDISK_FLUSH             (BASE_OPCODE_DAL + 19)
>> >> +
>> >> +/*
>> >> + * BDRVVXHSState specific flags
>> >> + */
>> >> +#define OF_VDISK_FLAGS_STATE_ACTIVE             0x0000000000000001
>> >> +#define OF_VDISK_FLAGS_STATE_FAILED             0x0000000000000002
>> >> +#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004
>> >> +
>> >> +#define OF_VDISK_ACTIVE(s) \
>> >> +    ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +#define OF_VDISK_SET_ACTIVE(s) \
>> >> +    ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +#define OF_VDISK_RESET_ACTIVE(s) \
>> >> +    ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_ACTIVE)
>> >> +
>> >> +#define OF_VDISK_FAILED(s) \
>> >> +    ((s)->vdisk_flags & OF_VDISK_FLAGS_STATE_FAILED)
>> >> +#define OF_VDISK_SET_FAILED(s) \
>> >> +    ((s)->vdisk_flags |= OF_VDISK_FLAGS_STATE_FAILED)
>> >> +#define OF_VDISK_RESET_FAILED(s) \
>> >> +    ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_STATE_FAILED)
>> >> +
>> >> +#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s) \
>> >> +    ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +#define OF_VDISK_SET_IOFAILOVER_IN_PROGRESS(s) \
>> >> +    ((s)->vdisk_flags |= OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +#define OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s) \
>> >> +    ((s)->vdisk_flags &= ~OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)
>> >> +
>> >> +/*
>> >> + * VXHSAIOCB specific flags
>> >> + */
>> >> +#define OF_ACB_QUEUED               0x00000001
>> >> +
>> >> +#define OF_AIOCB_FLAGS_QUEUED(a) \
>> >> +    ((a)->flags & OF_ACB_QUEUED)
>> >> +#define OF_AIOCB_FLAGS_SET_QUEUED(a) \
>> >> +    ((a)->flags |= OF_ACB_QUEUED)
>> >> +#define OF_AIOCB_FLAGS_RESET_QUEUED(a) \
>> >> +    ((a)->flags &= ~OF_ACB_QUEUED)
>> >> +
>> >
>> > Many helper macros here... IMO, it decreases the readability of the code,
>> > and my preference is to just have the underlying code without the use of
>> > the macros.
>> >
>>
>> I would like to keep this unchanged, if that's OK.
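For comparison, here is a tiny standalone illustration of the same test written both ways; ToyState below is a made-up stand-in for BDRVVXHSState, while the flag value and macro match the header above:

/* Illustration only: macro form vs. open-coded flag test. */
#include <stdint.h>
#include <stdio.h>

#define OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS   0x0000000000000004

typedef struct ToyState {
    int64_t vdisk_flags;
} ToyState;

#define OF_VDISK_IOFAILOVER_IN_PROGRESS(s) \
    ((s)->vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS)

int main(void)
{
    ToyState st = { .vdisk_flags = OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS };

    /* Macro form used throughout the patch: */
    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(&st)) {
        printf("failover in progress (macro form)\n");
    }

    /* Open-coded form Jeff is suggesting: */
    if (st.vdisk_flags & OF_VDISK_FLAGS_IOFAILOVER_IN_PROGRESS) {
        printf("failover in progress (open-coded)\n");
    }
    return 0;
}

Both compile to the same thing; the question is only which one reads better at the call sites.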
>>
>> >> +typedef struct qemu2qnio_ctx {
>> >> +    uint32_t            qnio_flag;
>> >> +    uint64_t            qnio_size;
>> >> +    char                *qnio_channel;
>> >> +    char                *target;
>> >> +    qnio_callback_t     qnio_cb;
>> >> +} qemu2qnio_ctx_t;
>> >> +
>> >> +typedef qemu2qnio_ctx_t qnio2qemu_ctx_t;
>> >> +
>> >> +typedef struct LibQNIOSymbol {
>> >> +    const char *name;
>> >> +    gpointer *addr;
>> >> +} LibQNIOSymbol;
>> >> +
>> >> +typedef void (*iio_cb_t) (uint32_t rfd, uint32_t reason, void *ctx,
>> >> +                          void *reply);
>> >> +
>> >> +/*
>> >> + * HyperScale AIO callbacks structure
>> >> + */
>> >> +typedef struct VXHSAIOCB {
>> >> +    BlockAIOCB          common;
>> >> +    size_t              ret;
>> >> +    size_t              size;
>> >> +    QEMUBH              *bh;
>> >> +    int                 aio_done;
>> >> +    int                 segments;
>> >> +    int                 flags;
>> >> +    size_t              io_offset;
>> >> +    QEMUIOVector        *qiov;
>> >> +    void                *buffer;
>> >> +    int                 direction;  /* IO direction (r/w) */
>> >> +    QSIMPLEQ_ENTRY(VXHSAIOCB) retry_entry;
>> >> +} VXHSAIOCB;
>> >> +
>> >> +typedef struct VXHSvDiskHostsInfo {
>> >> +    int     qnio_cfd;       /* Channel FD */
>> >> +    int     vdisk_rfd;      /* vDisk remote FD */
>> >> +    char    *hostip;        /* Host's IP address */
>> >> +    int     port;           /* Host's port number */
>> >> +} VXHSvDiskHostsInfo;
>> >> +
>> >> +/*
>> >> + * Structure per vDisk maintained for state
>> >> + */
>> >> +typedef struct BDRVVXHSState {
>> >> +    int                     fds[2];
>> >> +    int64_t                 vdisk_size;
>> >> +    int64_t                 vdisk_blocks;
>> >> +    int64_t                 vdisk_flags;
>> >> +    int                     vdisk_aio_count;
>> >> +    int                     event_reader_pos;
>> >> +    VXHSAIOCB               *qnio_event_acb;
>> >> +    void                    *qnio_ctx;
>> >> +    void                    *vdisk_lock;      /* Lock to protect BDRVVXHSState */
>> >> +    void                    *vdisk_acb_lock;  /* Protects ACB */
>> >> +    VXHSvDiskHostsInfo      vdisk_hostinfo[MAX_HOSTS];  /* Per host info */
>> >> +    int                     vdisk_nhosts;     /* Total number of hosts */
>> >> +    int                     vdisk_cur_host_idx;   /* IOs are being shipped to */
>> >> +    int                     vdisk_ask_failover_idx;   /* asking permission to ship io */
>> >> +    QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>> >> +    int                     vdisk_aio_retry_qd;
>> >
>> > Is vdisk_aio_retry_qd just used for debugging purposes? I see it
>> > incremented/decremented throughout the code, but no one checks it.
>> >
>>
>> Yes, it is for debugging only. I've added a comment to that effect.
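While we are on the state structure: the fds[] pipe above is the hand-off point between the libqnio callback thread and the QEMU event loop (vxhs_fail_aio() writes the acb pointer to VDISK_FD_WRITE earlier in the patch, and vxhs_aio_event_reader() is registered on VDISK_FD_READ in vxhs_attach_aio_context()). Here is a minimal standalone sketch of just that idea; the ToyAcb type and names are illustrative only, not the driver's actual code:

/* Sketch of the pipe-based completion hand-off. */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

typedef struct ToyAcb {
    int ret;
} ToyAcb;

int main(void)
{
    int fds[2];
    ToyAcb *acb = malloc(sizeof(*acb));
    ToyAcb *completed = NULL;

    if (!acb || pipe(fds) < 0) {
        return 1;
    }
    acb->ret = 0;

    /* "callback thread" side: hand the acb pointer to the event loop */
    if (write(fds[1], &acb, sizeof(acb)) != sizeof(acb)) {
        return 1;
    }

    /* "event reader" side: pull the pointer back out and complete it */
    if (read(fds[0], &completed, sizeof(completed)) != sizeof(completed)) {
        return 1;
    }
    printf("completed acb %p with ret %d\n", (void *)completed, completed->ret);

    free(acb);
    close(fds[0]);
    close(fds[1]);
    return 0;
}

Only the pointer value crosses the pipe, so the acb itself must stay alive until the event-loop side has consumed it.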
>>
>>
>> >> +    char                    *vdisk_guid;
>> >> +} BDRVVXHSState;
>> >> +
>> >> +void bdrv_vxhs_init(void);
>> >> +void *vxhs_setup_qnio(void);
>> >> +void vxhs_iio_callback(uint32_t rfd, uint32_t reason, void *ctx, void *m);
>> >> +void vxhs_aio_event_reader(void *opaque);
>> >> +void vxhs_complete_aio(VXHSAIOCB *acb, BDRVVXHSState *s);
>> >> +int vxhs_aio_flush_cb(void *opaque);
>> >> +void vxhs_finish_aiocb(ssize_t ret, void *arg);
>> >> +unsigned long vxhs_get_vdisk_stat(BDRVVXHSState *s);
>> >> +int vxhs_open(BlockDriverState *bs, QDict *options,
>> >> +              int bdrv_flags, Error **errp);
>> >> +void vxhs_close(BlockDriverState *bs);
>> >> +BlockAIOCB *vxhs_aio_readv(BlockDriverState *bs, int64_t sector_num,
>> >> +                           QEMUIOVector *qiov, int nb_sectors,
>> >> +                           BlockCompletionFunc *cb, void *opaque);
>> >> +BlockAIOCB *vxhs_aio_writev(BlockDriverState *bs, int64_t sector_num,
>> >> +                            QEMUIOVector *qiov, int nb_sectors,
>> >> +                            BlockCompletionFunc *cb,
>> >> +                            void *opaque);
>> >> +int64_t vxhs_get_allocated_blocks(BlockDriverState *bs);
>> >> +BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs, int64_t sector_num,
>> >> +                        QEMUIOVector *qiov, int nb_sectors,
>> >> +                        BlockCompletionFunc *cb,
>> >> +                        void *opaque, int write);
>> >> +int vxhs_co_flush(BlockDriverState *bs);
>> >> +int64_t vxhs_getlength(BlockDriverState *bs);
>> >> +void vxhs_inc_vdisk_iocount(void *ptr, uint32_t delta);
>> >> +void vxhs_dec_vdisk_iocount(void *ptr, uint32_t delta);
>> >> +uint32_t vxhs_get_vdisk_iocount(void *ptr);
>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count);
>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count);
>> >> +int vxhs_dec_and_get_acb_segment_count(void *ptr, int count);
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> >> +int vxhs_failover_io(BDRVVXHSState *s);
>> >> +int vxhs_reopen_vdisk(BDRVVXHSState *s, int hostinfo_index);
>> >> +int vxhs_switch_storage_agent(BDRVVXHSState *s);
>> >> +int vxhs_handle_queued_ios(BDRVVXHSState *s);
>> >> +int vxhs_restart_aio(VXHSAIOCB *acb);
>> >> +void vxhs_fail_aio(VXHSAIOCB *acb, int err);
>> >> +void vxhs_failover_ioctl_cb(int res, void *ctx);
>> >> +
>> >> +
>> >> +#endif
>> >> diff --git a/configure b/configure
>> >> index 4b808f9..c71e270 100755
>> >> --- a/configure
>> >> +++ b/configure
>> >> @@ -320,6 +320,7 @@ vhdx=""
>> >>  numa=""
>> >>  tcmalloc="no"
>> >>  jemalloc="no"
>> >> +vxhs=""
>> >>
>> >>  # parse CC options first
>> >>  for opt do
>> >> @@ -1150,6 +1151,11 @@ for opt do
>> >>    ;;
>> >>    --enable-jemalloc) jemalloc="yes"
>> >>    ;;
>> >> +  --disable-vxhs) vxhs="no"
>> >> +  ;;
>> >> +  --enable-vxhs) vxhs="yes"
>> >> +  ;;
>> >> +
>> >>    *)
>> >>        echo "ERROR: unknown option $opt"
>> >>        echo "Try '$0 --help' for more information"
>> >> @@ -1380,6 +1386,7 @@ disabled with --disable-FEATURE, default is enabled if available:
>> >>    numa            libnuma support
>> >>    tcmalloc        tcmalloc support
>> >>    jemalloc        jemalloc support
>> >> +  vxhs            Veritas HyperScale vDisk backend support
>> >>
>> >>  NOTE: The object files are built at the place where configure is launched
>> >>  EOF
>> >> @@ -4543,6 +4550,43 @@ if do_cc -nostdlib -Wl,-r -Wl,--no-relax -o $TMPMO $TMPO; then
>> >>  fi
>> >>
>> >>  ##########################################
>> >> +# Veritas HyperScale block driver VxHS
>> >> +# Check if libqnio is installed
>> >> +if test "$vxhs" != "no" ; then
>> >> +  cat > $TMPC <<EOF
>> >> +#include <stdio.h>
>> >> +#include <qnio/qnio_api.h>
>> >> +
>> >> +void vxhs_inc_acb_segment_count(void *acb, int count);
>> >> +void vxhs_dec_acb_segment_count(void *acb, int count);
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer);
>> >> +
>> >> +void vxhs_inc_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +}
>> >> +void vxhs_dec_acb_segment_count(void *ptr, int count)
>> >> +{
>> >> +}
>> >> +void vxhs_set_acb_buffer(void *ptr, void *buffer)
>> >> +{
>> >> +}
>> >> +int main(void) {
>> >> +    qemu_ck_initialize_lock();
>> >> +    return 0;
>> >> +}
>> >> +EOF
>> >> +  vxhs_libs="-lqnioshim -lqnio"
>> >> +  if compile_prog "" "$vxhs_libs" ; then
>> >> +    vxhs=yes
>> >> +  else
>> >> +    if test "$vxhs" = "yes" ; then
>> >> +      feature_not_found "vxhs block device" "Install libqnio. See github"
>> >> +    fi
>> >> +    vxhs=no
>> >> +  fi
>> >> +fi
>> >> +
>> >> +##########################################
>> >>  # End of CC checks
>> >>  # After here, no more $cc or $ld runs
>> >>
>> >> @@ -5488,6 +5532,12 @@ if test "$pthread_setname_np" = "yes" ; then
>> >>    echo "CONFIG_PTHREAD_SETNAME_NP=y" >> $config_host_mak
>> >>  fi
>> >>
>> >> +if test "$vxhs" = "yes" ; then
>> >> +  echo "CONFIG_VXHS=y" >> $config_host_mak
>> >> +  echo "VXHS_CFLAGS=$vxhs_cflags" >> $config_host_mak
>> >> +  echo "VXHS_LIBS=$vxhs_libs" >> $config_host_mak
>> >> +fi
>> >> +
>> >>  if test "$tcg_interpreter" = "yes"; then
>> >>    QEMU_INCLUDES="-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES"
>> >>  elif test "$ARCH" = "sparc64" ; then
>> >
>> > Your configure file should indicate if vxhs is enabled or not. For
>> > instance, here is a snippet from where this occurs in the current
>> > configure file:
>> >
>> > [...]
>> > echo "NUMA host support $numa"
>> > echo "tcmalloc support $tcmalloc"
>> > echo "jemalloc support $jemalloc"
>> > echo "avx2 optimization $avx2_opt"
>> >
>>
>> Fixed.
>>
>> >> --
>> >> 2.5.5