On Wed, Sep 28, 2016 at 10:45 PM, Stefan Hajnoczi <stefa...@gmail.com> wrote:
> On Tue, Sep 27, 2016 at 09:09:49PM -0700, Ashish Mittal wrote:
>
> Review of .bdrv_open() and .bdrv_aio_writev() code paths.
>
> The big issues I see in this driver and libqnio:
>
> 1. Showstoppers like broken .bdrv_open() and leaking memory on every
>    reply message.
> 2. Insecure due to missing input validation (network packets and
>    configuration) and incorrect string handling.
> 3. Not fully asynchronous so QEMU and the guest may hang.
>
> Please think about the whole codebase and not just the lines I've
> pointed out in this review when fixing these sorts of issues.  There may
> be similar instances of these bugs elsewhere and it's important that
> they are fixed so that this can be merged.
Ping?

You didn't respond to the comments I raised.  The libqnio buffer
overflows and other issues from this email are still present.

I put a lot of time into reviewing this patch series and libqnio.  If
you want to get reviews please address feedback before sending a new
patch series.

>
>> +/*
>> + * Structure per vDisk maintained for state
>> + */
>> +typedef struct BDRVVXHSState {
>> +    int fds[2];
>> +    int64_t vdisk_size;
>> +    int64_t vdisk_blocks;
>> +    int64_t vdisk_flags;
>> +    int vdisk_aio_count;
>> +    int event_reader_pos;
>> +    VXHSAIOCB *qnio_event_acb;
>> +    void *qnio_ctx;
>> +    QemuSpin vdisk_lock; /* Lock to protect BDRVVXHSState */
>> +    QemuSpin vdisk_acb_lock;  /* Protects ACB */
>
> These comments are insufficient for documenting locking.  Not all fields
> are actually protected by these locks.  Please order fields according to
> lock coverage:
>
>   typedef struct VXHSAIOCB {
>       ...
>
>       /* Protected by BDRVVXHSState->vdisk_acb_lock */
>       int segments;
>       ...
>   };
>
>   typedef struct BDRVVXHSState {
>       ...
>
>       /* Protected by vdisk_lock */
>       QemuSpin vdisk_lock;
>       int vdisk_aio_count;
>       QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>       ...
>   }
>
>> +static void vxhs_qnio_iio_close(BDRVVXHSState *s, int idx)
>> +{
>> +    /*
>> +     * Close vDisk device
>> +     */
>> +    if (s->vdisk_hostinfo[idx].vdisk_rfd >= 0) {
>> +        iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo[idx].vdisk_rfd);
>
> libqnio comment:
> Why does iio_devclose() take an unused cfd argument?  Perhaps it can be
> dropped.
>
>> +        s->vdisk_hostinfo[idx].vdisk_rfd = -1;
>> +    }
>> +
>> +    /*
>> +     * Close QNIO channel against cached channel-fd
>> +     */
>> +    if (s->vdisk_hostinfo[idx].qnio_cfd >= 0) {
>> +        iio_close(s->qnio_ctx, s->vdisk_hostinfo[idx].qnio_cfd);
>
> libqnio comment:
> Why does iio_devclose() take an int32_t cfd argument but iio_close()
> takes a uint32_t cfd argument?
>
>> +        s->vdisk_hostinfo[idx].qnio_cfd = -1;
>> +    }
>> +}
>> +
>> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr,
>> +                              int *rfd, const char *file_name)
>> +{
>> +    /*
>> +     * Open qnio channel to storage agent if not opened before.
>> +     */
>> +    if (*cfd < 0) {
>> +        *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0);
>
> libqnio comments:
>
> 1.
> There is a buffer overflow in qnio_create_channel().  strncpy() is used
> incorrectly so a long hostname or port (both can be 99 characters long)
> will overflow channel->name[] (64 characters) or channel->port[] (8
> characters).
>
>   strncpy(channel->name, hostname, strlen(hostname) + 1);
>   strncpy(channel->port, port, strlen(port) + 1);
>
> The third argument must be the size of the *destination* buffer, not the
> source buffer.  Also note that strncpy() doesn't NUL-terminate the
> destination string so you must do that manually to ensure there is a NUL
> byte at the end of the buffer.
>
> 2.
> channel is leaked in the "Failed to open single connection" error case
> in qnio_create_channel().
>
> 3.
> If host is longer than 63 characters then the ioapi_ctx->channels and
> qnio_ctx->channels maps will use different keys due to string truncation
> in qnio_create_channel().  This means "Channel already exists" in
> qnio_create_channel() and possibly other things will not work as
> expected.
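
To illustrate the strncpy() point above, the copies in
qnio_create_channel() need to be bounded by the destination size and
terminated explicitly.  This is only a rough sketch against the buffer
sizes quoted above (channel->name[64], channel->port[8]); how the
error is reported has to follow the function's actual contract:

    /* Reject input that cannot fit, then copy bounded by the destination */
    if (strlen(hostname) >= sizeof(channel->name) ||
        strlen(port) >= sizeof(channel->port)) {
        /* fail channel creation instead of truncating silently */
        goto err;
    }
    strncpy(channel->name, hostname, sizeof(channel->name) - 1);
    channel->name[sizeof(channel->name) - 1] = '\0';
    strncpy(channel->port, port, sizeof(channel->port) - 1);
    channel->port[sizeof(channel->port) - 1] = '\0';
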
>
>> +        if (*cfd < 0) {
>> +            trace_vxhs_qnio_iio_open(of_vsa_addr);
>> +            return -ENODEV;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Open vdisk device
>> +     */
>> +    *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>
> libqnio comment:
> Buffer overflow in iio_devopen() since chandev[128] is not large enough
> to hold channel[100] + " " + devpath[arbitrary length] chars:
>
>   sprintf(chandev, "%s %s", channel, devpath);
>
>> +
>> +    if (*rfd < 0) {
>> +        if (*cfd >= 0) {
>
> This check is always true.  Otherwise the return -ENODEV would have been
> taken above.  The if statement isn't necessary.
>
>> +static void vxhs_check_failover_status(int res, void *ctx)
>> +{
>> +    BDRVVXHSState *s = ctx;
>> +
>> +    if (res == 0) {
>> +        /* found failover target */
>> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>> +        s->vdisk_ask_failover_idx = 0;
>> +        trace_vxhs_check_failover_status(
>> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                   s->vdisk_guid);
>> +        qemu_spin_lock(&s->vdisk_lock);
>> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        vxhs_handle_queued_ios(s);
>> +    } else {
>> +        /* keep looking */
>> +        trace_vxhs_check_failover_status_retry(s->vdisk_guid);
>> +        s->vdisk_ask_failover_idx++;
>> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>> +            /* pause and cycle through list again */
>> +            sleep(QNIO_CONNECT_RETRY_SECS);
>
> This code is called from a QEMU thread via vxhs_aio_rw().  It is not
> permitted to call sleep() since it will freeze QEMU and probably the
> guest.
>
> If you need a timer you can use QEMU's timer APIs.  See aio_timer_new(),
> timer_new_ns(), timer_mod(), timer_del(), timer_free().
>
>> +            s->vdisk_ask_failover_idx = 0;
>> +        }
>> +        res = vxhs_switch_storage_agent(s);
>> +    }
>> +}
>> +
>> +static int vxhs_failover_io(BDRVVXHSState *s)
>> +{
>> +    int res = 0;
>> +
>> +    trace_vxhs_failover_io(s->vdisk_guid);
>> +
>> +    s->vdisk_ask_failover_idx = 0;
>> +    res = vxhs_switch_storage_agent(s);
>> +
>> +    return res;
>> +}
>> +
>> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx,
>> +                              uint32_t error, uint32_t opcode)
>
> This function is doing too much.  Especially the failover code should
> run in the AioContext since it's complex.  Don't do failover here
> because this function is outside the AioContext lock.  Do it from
> AioContext using a QEMUBH like block/rbd.c.
>
>> +static int32_t
>> +vxhs_qnio_iio_writev(void *qnio_ctx, uint32_t rfd, QEMUIOVector *qiov,
>> +                     uint64_t offset, void *ctx, uint32_t flags)
>> +{
>> +    struct iovec cur;
>> +    uint64_t cur_offset = 0;
>> +    uint64_t cur_write_len = 0;
>> +    int segcount = 0;
>> +    int ret = 0;
>> +    int i, nsio = 0;
>> +    int iovcnt = qiov->niov;
>> +    struct iovec *iov = qiov->iov;
>> +
>> +    errno = 0;
>> +    cur.iov_base = 0;
>> +    cur.iov_len = 0;
>> +
>> +    ret = iio_writev(qnio_ctx, rfd, iov, iovcnt, offset, ctx, flags);
>
> libqnio comments:
>
> 1.
> There are blocking connect(2) and getaddrinfo(3) calls in iio_writev()
> so this may hang for arbitrary amounts of time.  This is not permitted
> in .bdrv_aio_readv()/.bdrv_aio_writev().  Please make qnio actually
> asynchronous.
>
> 2.
> Where does client_callback() free reply?  It looks like every reply
> message causes a memory leak!
>
> 3.
> Buffer overflow in iio_writev() since device[128] cannot fit the device
> string generated from the vdisk_guid.
>
> 4.
> Buffer overflow in iio_writev() due to
> strncpy(msg->hinfo.target,device,strlen(device)) where device[128] is
> larger than target[64].
> Also note the previous comments about strncpy() usage.
>
> 5.
> I don't see any endianness handling or portable alignment of struct
> fields in the network protocol code.  Binary network protocols need to
> take care of these issues for portability.  This means libqnio compiled
> for different architectures will not work.  Do you plan to support any
> other architectures besides x86?
>
> 6.
> The networking code doesn't look robust: kvset uses assert() on input
> from the network so the other side of the connection could cause SIGABRT
> (coredump), the client uses the msg pointer as the cookie for the
> response packet so the server can easily crash the client by sending a
> bogus cookie value, etc.  Even on the client side these things are
> troublesome but on a server they are guaranteed security issues.  I
> didn't look into it deeply.  Please audit the code.
>
>> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>> +                          int *cfd, int *rfd, Error **errp)
>> +{
>> +    QDict *backing_options = NULL;
>> +    QemuOpts *opts, *tcp_opts;
>> +    const char *vxhs_filename;
>> +    char *of_vsa_addr = NULL;
>> +    Error *local_err = NULL;
>> +    const char *vdisk_id_opt;
>> +    char *file_name = NULL;
>> +    size_t num_servers = 0;
>> +    char *str = NULL;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>> +    if (vxhs_filename) {
>> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>> +    }
>> +
>> +    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>> +    if (!vdisk_id_opt) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    s->vdisk_guid = g_strdup(vdisk_id_opt);
>> +    trace_vxhs_qemu_init_vdisk(vdisk_id_opt);
>> +
>> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>> +    if (num_servers < 1) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>> +        ret = -EINVAL;
>> +        goto out;
>> +    } else if (num_servers > VXHS_MAX_HOSTS) {
>> +        error_setg(&local_err, QERR_INVALID_PARAMETER, "server");
>> +        error_append_hint(errp, "Maximum %d servers allowed.\n",
>> +                          VXHS_MAX_HOSTS);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    trace_vxhs_qemu_init_numservers(num_servers);
>> +
>> +    for (i = 0; i < num_servers; i++) {
>> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>> +        qdict_extract_subqdict(options, &backing_options, str);
>> +
>> +        /* Create opts info from runtime_tcp_opts list */
>> +        tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>> +        qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err);
>> +        if (local_err) {
>> +            qdict_del(backing_options, str);
>
> backing_options is leaked and there's no need to delete the str key.
>
>> +            qemu_opts_del(tcp_opts);
>> +            g_free(str);
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +
>> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(tcp_opts,
>> +                                                            VXHS_OPT_HOST));
>> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(tcp_opts,
>> +                                                                 VXHS_OPT_PORT),
>> +                                                    NULL, 0);
>
> This will segfault if the port option was missing.
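
For the missing port case, the option has to be checked before it is
converted.  A minimal sketch, reusing the error style of the
surrounding vxhs_qemu_init() code (port_str is a name I made up):

    const char *port_str = qemu_opt_get(tcp_opts, VXHS_OPT_PORT);
    if (!port_str) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_PORT);
        qemu_opts_del(tcp_opts);
        g_free(str);
        ret = -EINVAL;
        goto out;
    }
    s->vdisk_hostinfo[i].port = g_ascii_strtoll(port_str, NULL, 0);
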
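Going back to the sleep() comment in vxhs_check_failover_status()
further up, the retry pause can be expressed with the timer APIs I
mentioned there.  This is only a sketch; it assumes a new QEMUTimer
*retry_timer field in BDRVVXHSState and a hypothetical
vxhs_failover_retry_cb() that resets vdisk_ask_failover_idx and calls
vxhs_switch_storage_agent() again:

    /* created once, e.g. in .bdrv_open(), in the BDS AioContext */
    s->retry_timer = aio_timer_new(bdrv_get_aio_context(bs),
                                   QEMU_CLOCK_REALTIME, SCALE_NS,
                                   vxhs_failover_retry_cb, s);

    /* instead of sleep(QNIO_CONNECT_RETRY_SECS) in the failover path */
    timer_mod(s->retry_timer,
              qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
              QNIO_CONNECT_RETRY_SECS * NANOSECONDS_PER_SECOND);
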
>
>> +
>> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>> +                             s->vdisk_hostinfo[i].port);
>
> It's not safe to use the %s format specifier for a trace event with a
> NULL value.  In the case where hostip is NULL this could crash on some
> systems.
>
>> +
>> +        qdict_del(backing_options, str);
>> +        qemu_opts_del(tcp_opts);
>> +        g_free(str);
>> +    }
>
> backing_options is leaked.
>
>> +
>> +    s->vdisk_nhosts = i;
>> +    s->vdisk_cur_host_idx = 0;
>> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>
> Can we get here with num_servers == 0?  In that case this would access
> uninitialized memory.  I guess num_servers == 0 does not make sense and
> there should be an error case for it.
>
>> +
>> +    /*
>> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>> +     */
>> +    if (global_qnio_ctx == NULL) {
>> +        global_qnio_ctx = vxhs_setup_qnio();
>
> libqnio comment:
> The client epoll thread should mask all signals (like
> qemu_thread_create()).  Otherwise it may receive signals that it cannot
> deal with.
>
>> +        if (global_qnio_ctx == NULL) {
>> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
>> +    if (!ret) {
>> +        error_setg(&local_err, "Failed qnio_iio_open");
>> +        ret = -EIO;
>> +    }
>
> The return value of vxhs_qnio_iio_open() is 0 for success or -errno for
> error.
>
> I guess you never ran this code!  The block driver won't even open
> successfully.
>
>> +
>> +out:
>> +    g_free(file_name);
>> +    g_free(of_vsa_addr);
>> +    qemu_opts_del(opts);
>> +
>> +    if (ret < 0) {
>> +        for (i = 0; i < num_servers; i++) {
>> +            g_free(s->vdisk_hostinfo[i].hostip);
>> +        }
>> +        g_free(s->vdisk_guid);
>> +        s->vdisk_guid = NULL;
>> +        errno = -ret;
>
> There is no need to set errno here.  The return value already contains
> the error and the caller doesn't look at errno.
>
>> +    }
>> +    error_propagate(errp, local_err);
>> +
>> +    return ret;
>> +}
>> +
>> +static int vxhs_open(BlockDriverState *bs, QDict *options,
>> +                     int bdrv_flags, Error **errp)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    AioContext *aio_context;
>> +    int qemu_qnio_cfd = -1;
>> +    int device_opened = 0;
>> +    int qemu_rfd = -1;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_fail(ret);
>> +        return ret;
>> +    }
>> +
>> +    device_opened = 1;
>> +    s->qnio_ctx = global_qnio_ctx;
>> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>> +    s->vdisk_size = 0;
>> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>> +
>> +    /*
>> +     * Create a pipe for communicating between two threads in different
>> +     * context. Set handler for read event, which gets triggered when
>> +     * IO completion is done by non-QEMU context.
>> +     */
>> +    ret = qemu_pipe(s->fds);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_epipe('.');
>> +        ret = -errno;
>> +        goto errout;
>
> This leaks s->vdisk_guid, s->vdisk_hostinfo[i].hostip, etc.
> bdrv_close() will not be called so this function must do cleanup itself.
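
In other words, the errout path in vxhs_open() needs to free the
vxhs_qemu_init() allocations itself, in addition to closing the
device.  Roughly, mirroring the cleanup vxhs_qemu_init() already does
in its own error path (a sketch only):

    /* in the errout path, after closing the device: */
    for (i = 0; i < s->vdisk_nhosts; i++) {
        g_free(s->vdisk_hostinfo[i].hostip);
        s->vdisk_hostinfo[i].hostip = NULL;
    }
    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;
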
>
>> +    }
>> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>> +
>> +    aio_context = bdrv_get_aio_context(bs);
>> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>> +                       false, vxhs_aio_event_reader, NULL, s);
>> +
>> +    /*
>> +     * Initialize the spin-locks.
>> +     */
>> +    qemu_spin_init(&s->vdisk_lock);
>> +    qemu_spin_init(&s->vdisk_acb_lock);
>> +
>> +    return 0;
>> +
>> +errout:
>> +    /*
>> +     * Close remote vDisk device if it was opened earlier
>> +     */
>> +    if (device_opened) {
>
> This is always true.  The device_opened variable can be removed.
>
>> +/*
>> + * This allocates QEMU-VXHS callback for each IO
>> + * and is passed to QNIO. When QNIO completes the work,
>> + * it will be passed back through the callback.
>> + */
>> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>> +                               int64_t sector_num, QEMUIOVector *qiov,
>> +                               int nb_sectors,
>> +                               BlockCompletionFunc *cb,
>> +                               void *opaque, int iodir)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    BDRVVXHSState *s = bs->opaque;
>> +    size_t size;
>> +    uint64_t offset;
>> +    int iio_flags = 0;
>> +    int ret = 0;
>> +    void *qnio_ctx = s->qnio_ctx;
>> +    uint32_t rfd = s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd;
>> +
>> +    offset = sector_num * BDRV_SECTOR_SIZE;
>> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>> +    /*
>> +     * Setup or initialize VXHSAIOCB.
>> +     * Every single field should be initialized since
>> +     * acb will be picked up from the slab without
>> +     * initializing with zero.
>> +     */
>> +    acb->io_offset = offset;
>> +    acb->size = size;
>> +    acb->ret = 0;
>> +    acb->flags = 0;
>> +    acb->aio_done = VXHS_IO_INPROGRESS;
>> +    acb->segments = 0;
>> +    acb->buffer = 0;
>> +    acb->qiov = qiov;
>> +    acb->direction = iodir;
>> +
>> +    qemu_spin_lock(&s->vdisk_lock);
>> +    if (OF_VDISK_FAILED(s)) {
>> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        goto errout;
>> +    }
>> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +        s->vdisk_aio_retry_qd++;
>> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>> +        goto out;
>> +    }
>> +    s->vdisk_aio_count++;
>> +    qemu_spin_unlock(&s->vdisk_lock);
>> +
>> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +
>> +    switch (iodir) {
>> +    case VDISK_AIO_WRITE:
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        ret = vxhs_qnio_iio_writev(qnio_ctx, rfd, qiov,
>> +                                   offset, (void *)acb, iio_flags);
>> +        break;
>> +    case VDISK_AIO_READ:
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        ret = vxhs_qnio_iio_readv(qnio_ctx, rfd, qiov,
>> +                                  offset, (void *)acb, iio_flags);
>> +        break;
>> +    default:
>> +        trace_vxhs_aio_rw_invalid(iodir);
>> +        goto errout;
>
> s->vdisk_aio_count must be decremented before returning.
>
>> +static void vxhs_close(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    int i;
>> +
>> +    trace_vxhs_close(s->vdisk_guid);
>> +    close(s->fds[VDISK_FD_READ]);
>> +    close(s->fds[VDISK_FD_WRITE]);
>> +
>> +    /*
>> +     * Clearing all the event handlers for oflame registered to QEMU
>> +     */
>> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>> +                       false, NULL, NULL, NULL);
>
> Please remove the event handler before closing the fd.  I don't think it
> matters in this case but in other scenarios there could be race
> conditions if another thread opens an fd and the file descriptor number
> is reused.
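
Concretely, for vxhs_close() that just means swapping the order, using
the same calls the patch already has:

    /* remove the event handler first, then close the pipe fds */
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
                       false, NULL, NULL, NULL);
    close(s->fds[VDISK_FD_READ]);
    close(s->fds[VDISK_FD_WRITE]);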