On Wed, Sep 28, 2016 at 10:45 PM, Stefan Hajnoczi <stefa...@gmail.com> wrote:
> On Tue, Sep 27, 2016 at 09:09:49PM -0700, Ashish Mittal wrote:
>
> Review of .bdrv_open() and .bdrv_aio_writev() code paths.
>
> The big issues I see in this driver and libqnio:
>
> 1. Showstoppers like broken .bdrv_open() and leaking memory on every
>    reply message.
> 2. Insecure due to missing input validation (network packets and
>    configuration) and incorrect string handling.
> 3. Not fully asynchronous so QEMU and the guest may hang.
>
> Please think about the whole codebase and not just the lines I've
> pointed out in this review when fixing these sorts of issues.  There may
> be similar instances of these bugs elsewhere and it's important that
> they are fixed so that this can be merged.
Ping?

You didn't respond to the comments I raised.  The libqnio buffer
overflows and other issues from this email are still present.

I put a lot of time into reviewing this patch series and libqnio.  If
you want to get reviews please address feedback before sending a new
patch series.

>
>> +/*
>> + * Structure per vDisk maintained for state
>> + */
>> +typedef struct BDRVVXHSState {
>> +    int fds[2];
>> +    int64_t vdisk_size;
>> +    int64_t vdisk_blocks;
>> +    int64_t vdisk_flags;
>> +    int vdisk_aio_count;
>> +    int event_reader_pos;
>> +    VXHSAIOCB *qnio_event_acb;
>> +    void *qnio_ctx;
>> +    QemuSpin vdisk_lock; /* Lock to protect BDRVVXHSState */
>> +    QemuSpin vdisk_acb_lock;  /* Protects ACB */
>
> These comments are insufficient for documenting locking.  Not all fields
> are actually protected by these locks.  Please order fields according to
> lock coverage:
>
>   typedef struct VXHSAIOCB {
>       ...
>
>       /* Protected by BDRVVXHSState->vdisk_acb_lock */
>       int segments;
>       ...
>   };
>
>   typedef struct BDRVVXHSState {
>       ...
>
>       /* Protected by vdisk_lock */
>       QemuSpin vdisk_lock;
>       int vdisk_aio_count;
>       QSIMPLEQ_HEAD(aio_retryq, VXHSAIOCB) vdisk_aio_retryq;
>       ...
>   }
>
>> +static void vxhs_qnio_iio_close(BDRVVXHSState *s, int idx)
>> +{
>> +    /*
>> +     * Close vDisk device
>> +     */
>> +    if (s->vdisk_hostinfo[idx].vdisk_rfd >= 0) {
>> +        iio_devclose(s->qnio_ctx, 0, s->vdisk_hostinfo[idx].vdisk_rfd);
>
> libqnio comment:
> Why does iio_devclose() take an unused cfd argument?  Perhaps it can be
> dropped.
>
>> +        s->vdisk_hostinfo[idx].vdisk_rfd = -1;
>> +    }
>> +
>> +    /*
>> +     * Close QNIO channel against cached channel-fd
>> +     */
>> +    if (s->vdisk_hostinfo[idx].qnio_cfd >= 0) {
>> +        iio_close(s->qnio_ctx, s->vdisk_hostinfo[idx].qnio_cfd);
>
> libqnio comment:
> Why does iio_devclose() take an int32_t cfd argument but iio_close()
> takes a uint32_t cfd argument?
>
>> +        s->vdisk_hostinfo[idx].qnio_cfd = -1;
>> +    }
>> +}
>> +
>> +static int vxhs_qnio_iio_open(int *cfd, const char *of_vsa_addr,
>> +                              int *rfd, const char *file_name)
>> +{
>> +    /*
>> +     * Open qnio channel to storage agent if not opened before.
>> +     */
>> +    if (*cfd < 0) {
>> +        *cfd = iio_open(global_qnio_ctx, of_vsa_addr, 0);
>
> libqnio comments:
>
> 1.
> There is a buffer overflow in qnio_create_channel().  strncpy() is used
> incorrectly so a long hostname or port (both can be 99 characters long)
> will overflow channel->name[] (64 characters) or channel->port[] (8
> characters).
>
>   strncpy(channel->name, hostname, strlen(hostname) + 1);
>   strncpy(channel->port, port, strlen(port) + 1);
>
> The third argument must be the size of the *destination* buffer, not the
> source buffer.  Also note that strncpy() doesn't NUL-terminate the
> destination string so you must do that manually to ensure there is a NUL
> byte at the end of the buffer.
>
> 2.
> channel is leaked in the "Failed to open single connection" error case
> in qnio_create_channel().
>
> 3.
> If host is longer than 63 characters then the ioapi_ctx->channels and
> qnio_ctx->channels maps will use different keys due to string truncation
> in qnio_create_channel().  This means "Channel already exists" in
> qnio_create_channel() and possibly other things will not work as
> expected.
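
To illustrate the strncpy() point above, the copies in
qnio_create_channel() need to be bounded by the destination size and
terminated explicitly.  This is only a rough sketch against the buffer
sizes quoted above (channel->name[64], channel->port[8]); how the
error is reported has to follow the function's actual contract:

    /* Reject input that cannot fit, then copy bounded by the destination */
    if (strlen(hostname) >= sizeof(channel->name) ||
        strlen(port) >= sizeof(channel->port)) {
        /* fail channel creation instead of truncating silently */
        goto err;
    }
    strncpy(channel->name, hostname, sizeof(channel->name) - 1);
    channel->name[sizeof(channel->name) - 1] = '\0';
    strncpy(channel->port, port, sizeof(channel->port) - 1);
    channel->port[sizeof(channel->port) - 1] = '\0';
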
>
>> +        if (*cfd < 0) {
>> +            trace_vxhs_qnio_iio_open(of_vsa_addr);
>> +            return -ENODEV;
>> +        }
>> +    }
>> +
>> +    /*
>> +     * Open vdisk device
>> +     */
>> +    *rfd = iio_devopen(global_qnio_ctx, *cfd, file_name, 0);
>
> libqnio comment:
> Buffer overflow in iio_devopen() since chandev[128] is not large enough
> to hold channel[100] + " " + devpath[arbitrary length] chars:
>
>   sprintf(chandev, "%s %s", channel, devpath);
>
>> +
>> +    if (*rfd < 0) {
>> +        if (*cfd >= 0) {
>
> This check is always true.  Otherwise the return -ENODEV would have been
> taken above.  The if statement isn't necessary.
>
>> +static void vxhs_check_failover_status(int res, void *ctx)
>> +{
>> +    BDRVVXHSState *s = ctx;
>> +
>> +    if (res == 0) {
>> +        /* found failover target */
>> +        s->vdisk_cur_host_idx = s->vdisk_ask_failover_idx;
>> +        s->vdisk_ask_failover_idx = 0;
>> +        trace_vxhs_check_failover_status(
>> +                   s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                   s->vdisk_guid);
>> +        qemu_spin_lock(&s->vdisk_lock);
>> +        OF_VDISK_RESET_IOFAILOVER_IN_PROGRESS(s);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        vxhs_handle_queued_ios(s);
>> +    } else {
>> +        /* keep looking */
>> +        trace_vxhs_check_failover_status_retry(s->vdisk_guid);
>> +        s->vdisk_ask_failover_idx++;
>> +        if (s->vdisk_ask_failover_idx == s->vdisk_nhosts) {
>> +            /* pause and cycle through list again */
>> +            sleep(QNIO_CONNECT_RETRY_SECS);
>
> This code is called from a QEMU thread via vxhs_aio_rw().  It is not
> permitted to call sleep() since it will freeze QEMU and probably the
> guest.
>
> If you need a timer you can use QEMU's timer APIs.  See aio_timer_new(),
> timer_new_ns(), timer_mod(), timer_del(), timer_free().
>
>> +            s->vdisk_ask_failover_idx = 0;
>> +        }
>> +        res = vxhs_switch_storage_agent(s);
>> +    }
>> +}
>> +
>> +static int vxhs_failover_io(BDRVVXHSState *s)
>> +{
>> +    int res = 0;
>> +
>> +    trace_vxhs_failover_io(s->vdisk_guid);
>> +
>> +    s->vdisk_ask_failover_idx = 0;
>> +    res = vxhs_switch_storage_agent(s);
>> +
>> +    return res;
>> +}
>> +
>> +static void vxhs_iio_callback(int32_t rfd, uint32_t reason, void *ctx,
>> +                              uint32_t error, uint32_t opcode)
>
> This function is doing too much.  Especially the failover code should
> run in the AioContext since it's complex.  Don't do failover here
> because this function is outside the AioContext lock.  Do it from
> AioContext using a QEMUBH like block/rbd.c.
>
>> +static int32_t
>> +vxhs_qnio_iio_writev(void *qnio_ctx, uint32_t rfd, QEMUIOVector *qiov,
>> +                     uint64_t offset, void *ctx, uint32_t flags)
>> +{
>> +    struct iovec cur;
>> +    uint64_t cur_offset = 0;
>> +    uint64_t cur_write_len = 0;
>> +    int segcount = 0;
>> +    int ret = 0;
>> +    int i, nsio = 0;
>> +    int iovcnt = qiov->niov;
>> +    struct iovec *iov = qiov->iov;
>> +
>> +    errno = 0;
>> +    cur.iov_base = 0;
>> +    cur.iov_len = 0;
>> +
>> +    ret = iio_writev(qnio_ctx, rfd, iov, iovcnt, offset, ctx, flags);
>
> libqnio comments:
>
> 1.
> There are blocking connect(2) and getaddrinfo(3) calls in iio_writev()
> so this may hang for arbitrary amounts of time.  This is not permitted
> in .bdrv_aio_readv()/.bdrv_aio_writev().  Please make qnio actually
> asynchronous.
>
> 2.
> Where does client_callback() free reply?  It looks like every reply
> message causes a memory leak!
>
> 3.
> Buffer overflow in iio_writev() since device[128] cannot fit the device
> string generated from the vdisk_guid.
>
> 4.
> Buffer overflow in iio_writev() due to
> strncpy(msg->hinfo.target,device,strlen(device)) where device[128] is
> larger than target[64].
> Also note the previous comments about strncpy() usage.
>
> 5.
> I don't see any endianness handling or portable alignment of struct
> fields in the network protocol code.  Binary network protocols need to
> take care of these issues for portability.  This means libqnio compiled
> for different architectures will not work.  Do you plan to support any
> other architectures besides x86?
>
> 6.
> The networking code doesn't look robust: kvset uses assert() on input
> from the network so the other side of the connection could cause SIGABRT
> (coredump), the client uses the msg pointer as the cookie for the
> response packet so the server can easily crash the client by sending a
> bogus cookie value, etc.  Even on the client side these things are
> troublesome but on a server they are guaranteed security issues.  I
> didn't look into it deeply.  Please audit the code.
>
>> +static int vxhs_qemu_init(QDict *options, BDRVVXHSState *s,
>> +                          int *cfd, int *rfd, Error **errp)
>> +{
>> +    QDict *backing_options = NULL;
>> +    QemuOpts *opts, *tcp_opts;
>> +    const char *vxhs_filename;
>> +    char *of_vsa_addr = NULL;
>> +    Error *local_err = NULL;
>> +    const char *vdisk_id_opt;
>> +    char *file_name = NULL;
>> +    size_t num_servers = 0;
>> +    char *str = NULL;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
>> +    qemu_opts_absorb_qdict(opts, options, &local_err);
>> +    if (local_err) {
>> +        error_propagate(errp, local_err);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +
>> +    vxhs_filename = qemu_opt_get(opts, VXHS_OPT_FILENAME);
>> +    if (vxhs_filename) {
>> +        trace_vxhs_qemu_init_filename(vxhs_filename);
>> +    }
>> +
>> +    vdisk_id_opt = qemu_opt_get(opts, VXHS_OPT_VDISK_ID);
>> +    if (!vdisk_id_opt) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_VDISK_ID);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    s->vdisk_guid = g_strdup(vdisk_id_opt);
>> +    trace_vxhs_qemu_init_vdisk(vdisk_id_opt);
>> +
>> +    num_servers = qdict_array_entries(options, VXHS_OPT_SERVER);
>> +    if (num_servers < 1) {
>> +        error_setg(&local_err, QERR_MISSING_PARAMETER, "server");
>> +        ret = -EINVAL;
>> +        goto out;
>> +    } else if (num_servers > VXHS_MAX_HOSTS) {
>> +        error_setg(&local_err, QERR_INVALID_PARAMETER, "server");
>> +        error_append_hint(errp, "Maximum %d servers allowed.\n",
>> +                          VXHS_MAX_HOSTS);
>> +        ret = -EINVAL;
>> +        goto out;
>> +    }
>> +    trace_vxhs_qemu_init_numservers(num_servers);
>> +
>> +    for (i = 0; i < num_servers; i++) {
>> +        str = g_strdup_printf(VXHS_OPT_SERVER"%d.", i);
>> +        qdict_extract_subqdict(options, &backing_options, str);
>> +
>> +        /* Create opts info from runtime_tcp_opts list */
>> +        tcp_opts = qemu_opts_create(&runtime_tcp_opts, NULL, 0, &error_abort);
>> +        qemu_opts_absorb_qdict(tcp_opts, backing_options, &local_err);
>> +        if (local_err) {
>> +            qdict_del(backing_options, str);
>
> backing_options is leaked and there's no need to delete the str key.
>
>> +            qemu_opts_del(tcp_opts);
>> +            g_free(str);
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +
>> +        s->vdisk_hostinfo[i].hostip = g_strdup(qemu_opt_get(tcp_opts,
>> +                                                            VXHS_OPT_HOST));
>> +        s->vdisk_hostinfo[i].port = g_ascii_strtoll(qemu_opt_get(tcp_opts,
>> +                                                                 VXHS_OPT_PORT),
>> +                                                    NULL, 0);
>
> This will segfault if the port option was missing.
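
For the missing port case, the option has to be checked before it is
converted.  A minimal sketch, reusing the error style of the
surrounding vxhs_qemu_init() code (port_str is a name I made up):

    const char *port_str = qemu_opt_get(tcp_opts, VXHS_OPT_PORT);
    if (!port_str) {
        error_setg(&local_err, QERR_MISSING_PARAMETER, VXHS_OPT_PORT);
        qemu_opts_del(tcp_opts);
        g_free(str);
        ret = -EINVAL;
        goto out;
    }
    s->vdisk_hostinfo[i].port = g_ascii_strtoll(port_str, NULL, 0);
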
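Going back to the sleep() comment in vxhs_check_failover_status()
further up, the retry pause can be expressed with the timer APIs I
mentioned there.  This is only a sketch; it assumes a new QEMUTimer
*retry_timer field in BDRVVXHSState and a hypothetical
vxhs_failover_retry_cb() that resets vdisk_ask_failover_idx and calls
vxhs_switch_storage_agent() again:

    /* created once, e.g. in .bdrv_open(), in the BDS AioContext */
    s->retry_timer = aio_timer_new(bdrv_get_aio_context(bs),
                                   QEMU_CLOCK_REALTIME, SCALE_NS,
                                   vxhs_failover_retry_cb, s);

    /* instead of sleep(QNIO_CONNECT_RETRY_SECS) in the failover path */
    timer_mod(s->retry_timer,
              qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
              QNIO_CONNECT_RETRY_SECS * NANOSECONDS_PER_SECOND);
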
>
>> +
>> +        s->vdisk_hostinfo[i].qnio_cfd = -1;
>> +        s->vdisk_hostinfo[i].vdisk_rfd = -1;
>> +        trace_vxhs_qemu_init(s->vdisk_hostinfo[i].hostip,
>> +                             s->vdisk_hostinfo[i].port);
>
> It's not safe to use the %s format specifier for a trace event with a
> NULL value.  In the case where hostip is NULL this could crash on some
> systems.
>
>> +
>> +        qdict_del(backing_options, str);
>> +        qemu_opts_del(tcp_opts);
>> +        g_free(str);
>> +    }
>
> backing_options is leaked.
>
>> +
>> +    s->vdisk_nhosts = i;
>> +    s->vdisk_cur_host_idx = 0;
>> +    file_name = g_strdup_printf("%s%s", vdisk_prefix, s->vdisk_guid);
>> +    of_vsa_addr = g_strdup_printf("of://%s:%d",
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].hostip,
>> +                                  s->vdisk_hostinfo[s->vdisk_cur_host_idx].port);
>
> Can we get here with num_servers == 0?  In that case this would access
> uninitialized memory.  I guess num_servers == 0 does not make sense and
> there should be an error case for it.
>
>> +
>> +    /*
>> +     * .bdrv_open() and .bdrv_create() run under the QEMU global mutex.
>> +     */
>> +    if (global_qnio_ctx == NULL) {
>> +        global_qnio_ctx = vxhs_setup_qnio();
>
> libqnio comment:
> The client epoll thread should mask all signals (like
> qemu_thread_create()).  Otherwise it may receive signals that it cannot
> deal with.
>
>> +        if (global_qnio_ctx == NULL) {
>> +            error_setg(&local_err, "Failed vxhs_setup_qnio");
>> +            ret = -EINVAL;
>> +            goto out;
>> +        }
>> +    }
>> +
>> +    ret = vxhs_qnio_iio_open(cfd, of_vsa_addr, rfd, file_name);
>> +    if (!ret) {
>> +        error_setg(&local_err, "Failed qnio_iio_open");
>> +        ret = -EIO;
>> +    }
>
> The return value of vxhs_qnio_iio_open() is 0 for success or -errno for
> error.
>
> I guess you never ran this code!  The block driver won't even open
> successfully.
>
>> +
>> +out:
>> +    g_free(file_name);
>> +    g_free(of_vsa_addr);
>> +    qemu_opts_del(opts);
>> +
>> +    if (ret < 0) {
>> +        for (i = 0; i < num_servers; i++) {
>> +            g_free(s->vdisk_hostinfo[i].hostip);
>> +        }
>> +        g_free(s->vdisk_guid);
>> +        s->vdisk_guid = NULL;
>> +        errno = -ret;
>
> There is no need to set errno here.  The return value already contains
> the error and the caller doesn't look at errno.
>
>> +    }
>> +    error_propagate(errp, local_err);
>> +
>> +    return ret;
>> +}
>> +
>> +static int vxhs_open(BlockDriverState *bs, QDict *options,
>> +                     int bdrv_flags, Error **errp)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    AioContext *aio_context;
>> +    int qemu_qnio_cfd = -1;
>> +    int device_opened = 0;
>> +    int qemu_rfd = -1;
>> +    int ret = 0;
>> +    int i;
>> +
>> +    ret = vxhs_qemu_init(options, s, &qemu_qnio_cfd, &qemu_rfd, errp);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_fail(ret);
>> +        return ret;
>> +    }
>> +
>> +    device_opened = 1;
>> +    s->qnio_ctx = global_qnio_ctx;
>> +    s->vdisk_hostinfo[0].qnio_cfd = qemu_qnio_cfd;
>> +    s->vdisk_hostinfo[0].vdisk_rfd = qemu_rfd;
>> +    s->vdisk_size = 0;
>> +    QSIMPLEQ_INIT(&s->vdisk_aio_retryq);
>> +
>> +    /*
>> +     * Create a pipe for communicating between two threads in different
>> +     * context. Set handler for read event, which gets triggered when
>> +     * IO completion is done by non-QEMU context.
>> +     */
>> +    ret = qemu_pipe(s->fds);
>> +    if (ret < 0) {
>> +        trace_vxhs_open_epipe('.');
>> +        ret = -errno;
>> +        goto errout;
>
> This leaks s->vdisk_guid, s->vdisk_hostinfo[i].hostip, etc.
> bdrv_close() will not be called so this function must do cleanup itself.
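
In other words, the errout path in vxhs_open() needs to free the
vxhs_qemu_init() allocations itself, in addition to closing the
device.  Roughly, mirroring the cleanup vxhs_qemu_init() already does
in its own error path (a sketch only):

    /* in the errout path, after closing the device: */
    for (i = 0; i < s->vdisk_nhosts; i++) {
        g_free(s->vdisk_hostinfo[i].hostip);
        s->vdisk_hostinfo[i].hostip = NULL;
    }
    g_free(s->vdisk_guid);
    s->vdisk_guid = NULL;
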
>
>> +    }
>> +    fcntl(s->fds[VDISK_FD_READ], F_SETFL, O_NONBLOCK);
>> +
>> +    aio_context = bdrv_get_aio_context(bs);
>> +    aio_set_fd_handler(aio_context, s->fds[VDISK_FD_READ],
>> +                       false, vxhs_aio_event_reader, NULL, s);
>> +
>> +    /*
>> +     * Initialize the spin-locks.
>> +     */
>> +    qemu_spin_init(&s->vdisk_lock);
>> +    qemu_spin_init(&s->vdisk_acb_lock);
>> +
>> +    return 0;
>> +
>> +errout:
>> +    /*
>> +     * Close remote vDisk device if it was opened earlier
>> +     */
>> +    if (device_opened) {
>
> This is always true.  The device_opened variable can be removed.
>
>> +/*
>> + * This allocates QEMU-VXHS callback for each IO
>> + * and is passed to QNIO. When QNIO completes the work,
>> + * it will be passed back through the callback.
>> + */
>> +static BlockAIOCB *vxhs_aio_rw(BlockDriverState *bs,
>> +                               int64_t sector_num, QEMUIOVector *qiov,
>> +                               int nb_sectors,
>> +                               BlockCompletionFunc *cb,
>> +                               void *opaque, int iodir)
>> +{
>> +    VXHSAIOCB *acb = NULL;
>> +    BDRVVXHSState *s = bs->opaque;
>> +    size_t size;
>> +    uint64_t offset;
>> +    int iio_flags = 0;
>> +    int ret = 0;
>> +    void *qnio_ctx = s->qnio_ctx;
>> +    uint32_t rfd = s->vdisk_hostinfo[s->vdisk_cur_host_idx].vdisk_rfd;
>> +
>> +    offset = sector_num * BDRV_SECTOR_SIZE;
>> +    size = nb_sectors * BDRV_SECTOR_SIZE;
>> +
>> +    acb = qemu_aio_get(&vxhs_aiocb_info, bs, cb, opaque);
>> +    /*
>> +     * Setup or initialize VXHSAIOCB.
>> +     * Every single field should be initialized since
>> +     * acb will be picked up from the slab without
>> +     * initializing with zero.
>> +     */
>> +    acb->io_offset = offset;
>> +    acb->size = size;
>> +    acb->ret = 0;
>> +    acb->flags = 0;
>> +    acb->aio_done = VXHS_IO_INPROGRESS;
>> +    acb->segments = 0;
>> +    acb->buffer = 0;
>> +    acb->qiov = qiov;
>> +    acb->direction = iodir;
>> +
>> +    qemu_spin_lock(&s->vdisk_lock);
>> +    if (OF_VDISK_FAILED(s)) {
>> +        trace_vxhs_aio_rw(s->vdisk_guid, iodir, size, offset);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        goto errout;
>> +    }
>> +    if (OF_VDISK_IOFAILOVER_IN_PROGRESS(s)) {
>> +        QSIMPLEQ_INSERT_TAIL(&s->vdisk_aio_retryq, acb, retry_entry);
>> +        s->vdisk_aio_retry_qd++;
>> +        OF_AIOCB_FLAGS_SET_QUEUED(acb);
>> +        qemu_spin_unlock(&s->vdisk_lock);
>> +        trace_vxhs_aio_rw_retry(s->vdisk_guid, acb, 1);
>> +        goto out;
>> +    }
>> +    s->vdisk_aio_count++;
>> +    qemu_spin_unlock(&s->vdisk_lock);
>> +
>> +    iio_flags = (IIO_FLAG_DONE | IIO_FLAG_ASYNC);
>> +
>> +    switch (iodir) {
>> +    case VDISK_AIO_WRITE:
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        ret = vxhs_qnio_iio_writev(qnio_ctx, rfd, qiov,
>> +                                   offset, (void *)acb, iio_flags);
>> +        break;
>> +    case VDISK_AIO_READ:
>> +        vxhs_inc_acb_segment_count(acb, 1);
>> +        ret = vxhs_qnio_iio_readv(qnio_ctx, rfd, qiov,
>> +                                  offset, (void *)acb, iio_flags);
>> +        break;
>> +    default:
>> +        trace_vxhs_aio_rw_invalid(iodir);
>> +        goto errout;
>
> s->vdisk_aio_count must be decremented before returning.
>
>> +static void vxhs_close(BlockDriverState *bs)
>> +{
>> +    BDRVVXHSState *s = bs->opaque;
>> +    int i;
>> +
>> +    trace_vxhs_close(s->vdisk_guid);
>> +    close(s->fds[VDISK_FD_READ]);
>> +    close(s->fds[VDISK_FD_WRITE]);
>> +
>> +    /*
>> +     * Clearing all the event handlers for oflame registered to QEMU
>> +     */
>> +    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
>> +                       false, NULL, NULL, NULL);
>
> Please remove the event handler before closing the fd.  I don't think it
> matters in this case but in other scenarios there could be race
> conditions if another thread opens an fd and the file descriptor number
> is reused.
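
Concretely, for vxhs_close() that just means swapping the order, using
the same calls the patch already has:

    /* remove the event handler first, then close the pipe fds */
    aio_set_fd_handler(bdrv_get_aio_context(bs), s->fds[VDISK_FD_READ],
                       false, NULL, NULL, NULL);
    close(s->fds[VDISK_FD_READ]);
    close(s->fds[VDISK_FD_WRITE]);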