tech@: The diff below splits virtio block and network device emulation out of the vm process into separate fork+exec'd & pledge(2)'d subprocesses.
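For anyone who hasn't followed the fork+exec work so far, the shape of each device process is: the vm process opens a socketpair per device, forks, re-execs vmd into a small per-device main loop, and that child drops straight to pledge("stdio") before touching any guest state. Here is a stand-alone sketch of that pattern (illustrative only -- none of the names, flags, or message formats below come from the diff itself):

/*
 * Illustrative sketch, not part of the diff: parent/child pattern for a
 * fork+exec'd, pledge(2)'d helper talking over a socketpair. The "-X"
 * flag, the fd-as-argv convention and the copy loop are invented for
 * this example; the real diff passes a struct virtio_dev over the fd.
 */
#include <sys/socket.h>
#include <sys/wait.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char *argv[])
{
	char fdbuf[16], buf[128];
	int sp[2], fd;
	ssize_t n;
	pid_t pid;

	if (argc == 3 && strcmp(argv[1], "-X") == 0) {
		/* Re-exec'd device child: only stdio from here on. */
		fd = atoi(argv[2]);
		if (pledge("stdio", NULL) == -1)
			err(1, "pledge");
		while ((n = read(fd, buf, sizeof(buf))) > 0)
			(void)write(STDOUT_FILENO, buf, n);
		return (0);
	}

	/* Parent ("vm process"): one private channel per device. */
	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sp) == -1)
		err(1, "socketpair");

	switch (pid = fork()) {
	case -1:
		err(1, "fork");
	case 0:
		/* Child: re-exec ourselves into the device main loop. */
		close(sp[0]);
		snprintf(fdbuf, sizeof(fdbuf), "%d", sp[1]);
		execl(argv[0], argv[0], "-X", fdbuf, (char *)NULL);
		err(1, "execl");	/* assumes argv[0] is a usable path */
	default:
		close(sp[1]);
		dprintf(sp[0], "device config goes here\n");
		close(sp[0]);
		waitpid(pid, NULL, 0);
		break;
	}
	return (0);
}

The real thing obviously does the imsg channel setup, guest memory remapping, and per-device event loop on the child side; see vioblk_main() and vionet_main() in the diff.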
In order of priority, this diff:

1. Isolates common exploit targets (e.g. emulated network devices) from the rest of the vm process, tightening pledge to "stdio" per device.
2. Increases responsiveness of guest i/o, since we no longer have a single thread servicing both virtio pci and device emulation.

I'd like to land this diff this week. If you use atypical vmd configurations like the following, please give it a spin:

1. multiple vioblk disks per vm
2. multiple nics per vm
3. send/receive
4. qcow2 base images

This diff has lots of info logging enabled by default to help me identify what's breaking, so please reply with log message output if something goes sideways.

-dv

diff refs/heads/master refs/heads/vmd-dev-exec5 commit - c1729c40788967aa8f85d17d2c7dd41b829a98b5 commit + 41bcacdba04d4f89c2ca0554baba6fc1041613f7 blob - d0e7d0c2fb1c11e39caea6896726b25ec315cd22 blob + e9387ec59530d7f441ee92687841634685991344 --- usr.sbin/vmd/Makefile +++ usr.sbin/vmd/Makefile @@ -7,7 +7,7 @@ SRCS+= vm_agentx.c SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c SRCS+= ns8250.c i8253.c dhcp.c packet.c mmio.c SRCS+= parse.y atomicio.c vioscsi.c vioraw.c vioqcow2.c fw_cfg.c -SRCS+= vm_agentx.c +SRCS+= vm_agentx.c vioblk.c vionet.c CFLAGS+= -Wall -I${.CURDIR} CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes blob - 7d7d8d02a263f6e426e359f4d8939fffe1f7d365 blob + a6fdb7623e5222248b7a22a907302a84376ffce3 --- usr.sbin/vmd/dhcp.c +++ usr.sbin/vmd/dhcp.c @@ -43,8 +43,9 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t extern struct vmd *env; ssize_t -dhcp_request(struct vionet_dev *dev, char *buf, size_t buflen, char **obuf) +dhcp_request(struct virtio_dev *dev, char *buf, size_t buflen, char **obuf) { + struct vionet_dev *vionet = &dev->vionet; unsigned char *respbuf = NULL, *op, *oe, dhcptype = 0; unsigned char *opts = NULL; ssize_t offset, optslen, respbuflen = 0; @@ -65,10 +66,10 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t return (-1); if (memcmp(pc.pc_dmac, broadcast, ETHER_ADDR_LEN) != 0 && - memcmp(pc.pc_dmac, dev->hostmac, ETHER_ADDR_LEN) != 0) + memcmp(pc.pc_dmac, vionet->hostmac, ETHER_ADDR_LEN) != 0) return (-1); - if (memcmp(pc.pc_smac, dev->mac, ETHER_ADDR_LEN) != 0) + if (memcmp(pc.pc_smac, vionet->mac, ETHER_ADDR_LEN) != 0) return (-1); if ((offset = decode_udp_ip_header(buf, buflen, offset, &pc)) < 0) @@ -87,7 +88,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t if (req.op != BOOTREQUEST || req.htype != pc.pc_htype || req.hlen != ETHER_ADDR_LEN || - memcmp(dev->mac, req.chaddr, req.hlen) != 0) + memcmp(vionet->mac, req.chaddr, req.hlen) != 0) return (-1); /* Ignore unsupported requests for now */ @@ -134,7 +135,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t resp.hlen = req.hlen; resp.xid = req.xid; - if (dev->pxeboot) { + if (vionet->pxeboot) { strlcpy(resp.file, "auto_install", sizeof resp.file); vm = vm_getbyvmid(dev->vm_vmid); if (vm && res_hnok(vm->vm_params.vmc_params.vcp_name)) @@ -143,7 +144,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t if ((client_addr.s_addr = vm_priv_addr(&env->vmd_cfg, - dev->vm_vmid, dev->idx, 1)) == 0) + dev->vm_vmid, vionet->idx, 1)) == 0) return (-1); memcpy(&resp.yiaddr, &client_addr, sizeof(client_addr)); @@ -152,7 +153,7 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t ss2sin(&pc.pc_dst)->sin_port = htons(CLIENT_PORT); if ((server_addr.s_addr = vm_priv_addr(&env->vmd_cfg, dev->vm_vmid, - dev->idx, 0)) == 0) + vionet->idx, 0)) == 0) return (-1); memcpy(&resp.siaddr, &server_addr, sizeof(server_addr)); 
memcpy(&ss2sin(&pc.pc_src)->sin_addr, &server_addr, @@ -167,9 +168,9 @@ dhcp_request(struct vionet_dev *dev, char *buf, size_t if ((respbuf = calloc(1, respbuflen)) == NULL) goto fail; - memcpy(&pc.pc_dmac, dev->mac, sizeof(pc.pc_dmac)); - memcpy(&resp.chaddr, dev->mac, resp.hlen); - memcpy(&pc.pc_smac, dev->mac, sizeof(pc.pc_smac)); + memcpy(&pc.pc_dmac, vionet->mac, sizeof(pc.pc_dmac)); + memcpy(&resp.chaddr, vionet->mac, resp.hlen); + memcpy(&pc.pc_smac, vionet->mac, sizeof(pc.pc_smac)); pc.pc_smac[5]++; if ((offset = assemble_hw_header(respbuf, respbuflen, 0, &pc, HTYPE_ETHER)) < 0) { blob - 38805432f810906db43448fe6b24b7fa5e05ad31 blob + a94493438b5a3db0b6953ea5e5609435293cd1f9 --- usr.sbin/vmd/vioqcow2.c +++ usr.sbin/vmd/vioqcow2.c @@ -110,9 +110,8 @@ static void qc2_close(void *, int); static void qc2_close(void *, int); /* - * Initializes a raw disk image backing file from an fd. - * Stores the number of 512 byte sectors in *szp, - * returning -1 for error, 0 for success. + * Initializes a raw disk image backing file from an fd. Stores the + * number of 512-byte sectors in *szp, returning -1 for error, 0 for success. * * May open snapshot base images. */ @@ -132,7 +131,7 @@ virtio_qcow2_init(struct virtio_backing *file, off_t * file->pread = qc2_pread; file->pwrite = qc2_pwrite; file->close = qc2_close; - *szp = diskp->disksz; + *szp = diskp->disksz / 512; return 0; } blob - /dev/null blob + 5ecf349292482d03433761352cbc9ec91b695bc1 (mode 644) --- /dev/null +++ usr.sbin/vmd/vioblk.c @@ -0,0 +1,1011 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2023 Dave Voutila <d...@openbsd.org> + * Copyright (c) 2015 Mike Larkin <mlar...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ +#include <sys/mman.h> +#include <sys/param.h> /* PAGE_SIZE */ +#include <sys/socket.h> + +#include <machine/vmmvar.h> +#include <dev/pci/virtio_pcireg.h> +#include <dev/pv/vioblkreg.h> +#include <dev/pv/virtioreg.h> + +#include <errno.h> +#include <event.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "atomicio.h" +#include "pci.h" +#include "virtio.h" +#include "vmd.h" + +extern char *__progname; +extern struct vmd_vm *current_vm; + +static const char *disk_type(int); +static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *); +static int handle_io_write(struct viodev_msg *, struct virtio_dev *); +void vioblk_notify_rx(struct vioblk_dev *); +int vioblk_notifyq(struct vioblk_dev *); + +static void dev_dispatch_vm(int, short, void *); +static void handle_sync_io(int, short, void *); + +static const char * +disk_type(int type) +{ + switch (type) { + case VMDF_RAW: return "raw"; + case VMDF_QCOW2: return "qcow2"; + } + return "unknown"; +} + +__dead void +vioblk_main(int fd) +{ + struct virtio_dev dev; + struct vioblk_dev *vioblk; + struct viodev_msg msg; + struct vmd_vm vm; + struct vm_create_params *vcp; + ssize_t sz; + off_t szp = 0; + int i, ret, type; + + log_procinit("vioblk"); + + /* + * We don't need access to the filesystem, but we aren't root, so + * unveil /var/empty with no permissions. + */ + if (unveil("/var/empty", "") == -1) + fatal("unveil /var/empty"); + if (unveil(NULL, NULL) == -1) + fatal("unveil lock"); + + /* stdio - needed for read/write to disk fds and channels to the vm. */ + if (pledge("stdio", NULL) == -1) + fatal("pledge"); + + /* Receive our virtio_dev, mostly preconfigured. */ + memset(&dev, 0, sizeof(dev)); + sz = atomicio(read, fd, &dev, sizeof(dev)); + if (sz != sizeof(dev)) { + ret = errno; + log_warn("failed to receive vionet"); + goto fail; + } + if (dev.dev_type != VMD_DEVTYPE_DISK) { + ret = EINVAL; + log_warn("received invalid device type"); + goto fail; + } + dev.sync_fd = fd; + vioblk = &dev.vioblk; + + log_info("%s: got viblk dev. num disk fds = %d, sync fd = %d, " + "async fd = %d, sz = %lld maxfer = %d", __func__, vioblk->ndisk_fd, + dev.sync_fd, dev.async_fd, vioblk->sz, vioblk->max_xfer); + + /* Receive our vm information from the vm process. */ + memset(&vm, 0, sizeof(vm)); + sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm)); + if (sz != sizeof(vm)) { + ret = EIO; + log_warnx("failed to receive vm details"); + goto fail; + } + vcp = &vm.vm_params.vmc_params; + current_vm = &vm; + setproctitle("%s/vioblk[%d]", vcp->vcp_name, vioblk->idx); + + /* Now that we have our vm information, we can remap memory. */ + ret = remap_guest_mem(&vm); + if (ret) { + log_warnx("failed to remap guest memory"); + goto fail; + } + + /* Initialize the virtio block abstractions. */ + type = vm.vm_params.vmc_disktypes[vioblk->idx]; + switch (type) { + case VMDF_RAW: + ret = virtio_raw_init(&vioblk->file, &szp, vioblk->disk_fd, + vioblk->ndisk_fd); + break; + case VMDF_QCOW2: + ret = virtio_qcow2_init(&vioblk->file, &szp, vioblk->disk_fd, + vioblk->ndisk_fd); + break; + default: + log_warnx("invalid disk image type"); + goto fail; + } + if (ret || szp < 0) { + log_warnx("failed to init disk %s image", disk_type(type)); + goto fail; + } + vioblk->sz = szp; + log_info("%s: initialized vioblk[%d] with %s image (sz=%lld)", __func__, + vioblk->idx, disk_type(type), vioblk->sz); + + /* If we're restoring hardware, reinitialize the virtqueue hva. 
*/ + if (vm.vm_state & VM_STATE_RECEIVED) + vioblk_update_qa(vioblk); + + /* Initialize libevent so we can start wiring event handlers. */ + event_init(); + + /* Wire up an async imsg channel. */ + log_info("%s: wiring in async vm event handler (fd=%d)", __func__, + dev.async_fd); + if (vm_device_pipe(&dev, dev_dispatch_vm)) { + ret = EIO; + log_warnx("vm_device_pipe"); + goto fail; + } + + /* Configure our sync channel event handler. */ + log_info("%s: wiring in sync channel handler (fd=%d)", __func__, + dev.sync_fd); + if (fcntl(dev.sync_fd, F_SETFL, O_NONBLOCK) == -1) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto fail; + } + imsg_init(&dev.sync_iev.ibuf, dev.sync_fd); + dev.sync_iev.handler = handle_sync_io; + dev.sync_iev.data = &dev; + dev.sync_iev.events = EV_READ; + imsg_event_add(&dev.sync_iev); + + /* Send a ready message over the sync channel. */ + log_info("%s: telling vm %s device is ready", __func__, vcp->vcp_name); + memset(&msg, 0, sizeof(msg)); + msg.type = VIODEV_MSG_READY; + imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + + /* Send a ready message over the async channel. */ + log_info("%s: sending heartbeat", __func__); + ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1, + &msg, sizeof(msg)); + if (ret == -1) { + log_warnx("%s: failed to send async ready message!", __func__); + goto fail; + } + + /* Engage the event loop! */ + ret = event_dispatch(); + + if (ret == 0) { + /* Clean shutdown. */ + close_fd(dev.sync_fd); + close_fd(dev.async_fd); + for (i = 0; i < (int)sizeof(vioblk->disk_fd); i++) + close_fd(vioblk->disk_fd[i]); + _exit(0); + /* NOTREACHED */ + } + +fail: + /* Try letting the vm know we've failed something. */ + memset(&msg, 0, sizeof(msg)); + msg.type = VIODEV_MSG_ERROR; + msg.data = ret; + imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + imsg_flush(&dev.sync_iev.ibuf); + + close_fd(dev.sync_fd); + close_fd(dev.async_fd); + for (i = 0; i < (int)sizeof(vioblk->disk_fd); i++) + close_fd(vioblk->disk_fd[i]); + _exit(ret); + /* NOTREACHED */ +} + +const char * +vioblk_cmd_name(uint32_t type) +{ + switch (type) { + case VIRTIO_BLK_T_IN: return "read"; + case VIRTIO_BLK_T_OUT: return "write"; + case VIRTIO_BLK_T_SCSI_CMD: return "scsi read"; + case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write"; + case VIRTIO_BLK_T_FLUSH: return "flush"; + case VIRTIO_BLK_T_FLUSH_OUT: return "flush out"; + case VIRTIO_BLK_T_GET_ID: return "get id"; + default: return "unknown"; + } +} + +void +vioblk_update_qa(struct vioblk_dev *dev) +{ + struct virtio_vq_info *vq_info; + void *hva = NULL; + + /* Invalid queue? */ + if (dev->cfg.queue_select > 0) + return; + + vq_info = &dev->vq[dev->cfg.queue_select]; + vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE; + + hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE)); + if (hva == NULL) + fatal("vioblk_update_qa"); + vq_info->q_hva = hva; +} + +void +vioblk_update_qs(struct vioblk_dev *dev) +{ + struct virtio_vq_info *vq_info; + + /* Invalid queue? 
*/ + if (dev->cfg.queue_select > 0) { + dev->cfg.queue_size = 0; + return; + } + + vq_info = &dev->vq[dev->cfg.queue_select]; + + /* Update queue pfn/size based on queue select */ + dev->cfg.queue_pfn = vq_info->q_gpa >> 12; + dev->cfg.queue_size = vq_info->qs; +} + +static void +vioblk_free_info(struct ioinfo *info) +{ + if (!info) + return; + free(info->buf); + free(info); +} + +static struct ioinfo * +vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz) +{ + struct ioinfo *info; + + /* Limit to 64M for now */ + if (sz > (1 << 26)) { + log_warnx("%s: read size exceeded 64M", __func__); + return (NULL); + } + + info = calloc(1, sizeof(*info)); + if (!info) + goto nomem; + info->buf = malloc(sz); + if (info->buf == NULL) + goto nomem; + info->len = sz; + info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; + info->file = &dev->file; + return info; + +nomem: + free(info); + log_warn("malloc error vioblk read"); + return (NULL); +} + + +static const uint8_t * +vioblk_finish_read(struct ioinfo *info) +{ + struct virtio_backing *file; + + file = info->file; + if (file == NULL || file->pread == NULL) { + log_warnx("%s: XXX null?!", __func__); + return NULL; + } + if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) { + info->error = errno; + log_warn("vioblk read error"); + return NULL; + } + + return info->buf; +} + +static struct ioinfo * +vioblk_start_write(struct vioblk_dev *dev, off_t sector, + paddr_t addr, size_t len) +{ + struct ioinfo *info; + + /* Limit to 64M for now */ + if (len > (1 << 26)) { + log_warnx("%s: write size exceeded 64M", __func__); + return (NULL); + } + + info = calloc(1, sizeof(*info)); + if (!info) + goto nomem; + + info->buf = malloc(len); + if (info->buf == NULL) + goto nomem; + info->len = len; + info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; + info->file = &dev->file; + + if (read_mem(addr, info->buf, info->len)) { + vioblk_free_info(info); + return NULL; + } + + return info; + +nomem: + free(info); + log_warn("malloc error vioblk write"); + return (NULL); +} + +static int +vioblk_finish_write(struct ioinfo *info) +{ + struct virtio_backing *file; + + file = info->file; + if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) { + log_warn("vioblk write error"); + return EIO; + } + return 0; +} + +/* + * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can + */ +int +vioblk_notifyq(struct vioblk_dev *dev) +{ + uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx; + uint8_t ds; + int cnt; + off_t secbias; + char *vr; + struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc; + struct vring_avail *avail; + struct vring_used *used; + struct virtio_blk_req_hdr cmd; + struct virtio_vq_info *vq_info; + + /* Invalid queue? 
*/ + if (dev->cfg.queue_notify > 0) + return (0); + + vq_info = &dev->vq[dev->cfg.queue_notify]; + vr = vq_info->q_hva; + if (vr == NULL) + fatalx("%s: null vring", __func__); + + /* Compute offsets in ring of descriptors, avail ring, and used ring */ + desc = (struct vring_desc *)(vr); + avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); + used = (struct vring_used *)(vr + vq_info->vq_usedoffset); + + idx = vq_info->last_avail & VIOBLK_QUEUE_MASK; + + if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) { + log_debug("%s - nothing to do?", __func__); + return (0); + } + + while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) { + + ds = VIRTIO_BLK_S_IOERR; + cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK; + cmd_desc = &desc[cmd_desc_idx]; + + if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) { + log_warnx("unchained vioblk cmd descriptor received " + "(idx %d)", cmd_desc_idx); + goto out; + } + + /* Read command from descriptor ring */ + if (cmd_desc->flags & VRING_DESC_F_WRITE) { + log_warnx("vioblk: unexpected writable cmd descriptor " + "%d", cmd_desc_idx); + goto out; + } + if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) { + log_warnx("vioblk: command read_mem error @ 0x%llx", + cmd_desc->addr); + goto out; + } + + switch (cmd.type) { + case VIRTIO_BLK_T_IN: + /* first descriptor */ + secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + secdata_desc = &desc[secdata_desc_idx]; + + if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { + log_warnx("unchained vioblk data descriptor " + "received (idx %d)", cmd_desc_idx); + goto out; + } + + cnt = 0; + secbias = 0; + do { + struct ioinfo *info; + const uint8_t *secdata; + + if ((secdata_desc->flags & VRING_DESC_F_WRITE) + == 0) { + log_warnx("vioblk: unwritable data " + "descriptor %d", secdata_desc_idx); + goto out; + } + + info = vioblk_start_read(dev, + cmd.sector + secbias, secdata_desc->len); + + if (info == NULL) { + log_warnx("vioblk: can't start read"); + goto out; + } + + /* read the data, use current data descriptor */ + secdata = vioblk_finish_read(info); + if (secdata == NULL) { + vioblk_free_info(info); + log_warnx("vioblk: block read error, " + "sector %lld", cmd.sector); + goto out; + } + + if (write_mem(secdata_desc->addr, secdata, + secdata_desc->len)) { + log_warnx("can't write sector " + "data to gpa @ 0x%llx", + secdata_desc->addr); + vioblk_free_info(info); + goto out; + } + + vioblk_free_info(info); + + secbias += (secdata_desc->len / + VIRTIO_BLK_SECTOR_SIZE); + secdata_desc_idx = secdata_desc->next & + VIOBLK_QUEUE_MASK; + secdata_desc = &desc[secdata_desc_idx]; + + /* Guard against infinite chains */ + if (++cnt >= VIOBLK_QUEUE_SIZE) { + log_warnx("%s: descriptor table " + "invalid", __func__); + goto out; + } + } while (secdata_desc->flags & VRING_DESC_F_NEXT); + + ds_desc_idx = secdata_desc_idx; + ds_desc = secdata_desc; + + ds = VIRTIO_BLK_S_OK; + break; + case VIRTIO_BLK_T_OUT: + secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + secdata_desc = &desc[secdata_desc_idx]; + + if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { + log_warnx("wr vioblk: unchained vioblk data " + "descriptor received (idx %d)", + cmd_desc_idx); + goto out; + } + + if (secdata_desc->len > dev->max_xfer) { + log_warnx("%s: invalid read size %d requested", + __func__, secdata_desc->len); + goto out; + } + + cnt = 0; + secbias = 0; + do { + struct ioinfo *info; + + if (secdata_desc->flags & VRING_DESC_F_WRITE) { + log_warnx("wr vioblk: unexpected " + "writable data descriptor %d", + secdata_desc_idx); + goto out; + } + + info 
= vioblk_start_write(dev, + cmd.sector + secbias, + secdata_desc->addr, secdata_desc->len); + + if (info == NULL) { + log_warnx("wr vioblk: can't read " + "sector data @ 0x%llx", + secdata_desc->addr); + goto out; + } + + if (vioblk_finish_write(info)) { + log_warnx("wr vioblk: disk write " + "error"); + vioblk_free_info(info); + goto out; + } + + vioblk_free_info(info); + + secbias += secdata_desc->len / + VIRTIO_BLK_SECTOR_SIZE; + + secdata_desc_idx = secdata_desc->next & + VIOBLK_QUEUE_MASK; + secdata_desc = &desc[secdata_desc_idx]; + + /* Guard against infinite chains */ + if (++cnt >= VIOBLK_QUEUE_SIZE) { + log_warnx("%s: descriptor table " + "invalid", __func__); + goto out; + } + } while (secdata_desc->flags & VRING_DESC_F_NEXT); + + ds_desc_idx = secdata_desc_idx; + ds_desc = secdata_desc; + + ds = VIRTIO_BLK_S_OK; + break; + case VIRTIO_BLK_T_FLUSH: + case VIRTIO_BLK_T_FLUSH_OUT: + ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + ds_desc = &desc[ds_desc_idx]; + + ds = VIRTIO_BLK_S_UNSUPP; + break; + case VIRTIO_BLK_T_GET_ID: + secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + secdata_desc = &desc[secdata_desc_idx]; + + /* + * We don't support this command yet. While it's not + * officially part of the virtio spec (will be in v1.2) + * there's no feature to negotiate. Linux drivers will + * often send this command regardless. + * + * When the command is received, it should appear as a + * chain of 3 descriptors, similar to the IN/OUT + * commands. The middle descriptor should have have a + * length of VIRTIO_BLK_ID_BYTES bytes. + */ + if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { + log_warnx("id vioblk: unchained vioblk data " + "descriptor received (idx %d)", + cmd_desc_idx); + goto out; + } + + /* Skip the data descriptor. 
*/ + ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK; + ds_desc = &desc[ds_desc_idx]; + + ds = VIRTIO_BLK_S_UNSUPP; + break; + default: + log_warnx("%s: unsupported command 0x%x", __func__, + cmd.type); + ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; + ds_desc = &desc[ds_desc_idx]; + + ds = VIRTIO_BLK_S_UNSUPP; + break; + } + + if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) { + log_warnx("%s: ds descriptor %d unwritable", __func__, + ds_desc_idx); + goto out; + } + if (write_mem(ds_desc->addr, &ds, sizeof(ds))) { + log_warnx("%s: can't write device status data @ 0x%llx", + __func__, ds_desc->addr); + goto out; + } + + dev->cfg.isr_status = 1; + used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx; + used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len; + __sync_synchronize(); + used->idx++; + + vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK; + idx = (idx + 1) & VIOBLK_QUEUE_MASK; + } +out: + return (1); +} + +static void +dev_dispatch_vm(int fd, short event, void *arg) +{ + struct virtio_dev *dev = (struct virtio_dev *)arg; + struct imsgev *iev = &dev->async_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct imsg imsg; + ssize_t n = 0; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_READ)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_WRITE)", __func__); + event_del(&iev->ev); + event_loopbreak(); + return; + } + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatal("%s: imsg_get", __func__); + if (n == 0) + break; + + switch (imsg.hdr.type) { + case IMSG_VMDOP_PAUSE_VM: + log_debug("%s: pausing", __func__); + break; + case IMSG_VMDOP_UNPAUSE_VM: + log_debug("%s: unpausing", __func__); + break; + default: + log_warnx("%s: unhandled imsg type %d", __func__, + imsg.hdr.type); + break; + } + imsg_free(&imsg); + } + imsg_event_add(iev); +} + +/* + * Synchronous IO handler. + * + */ +static void +handle_sync_io(int fd, short event, void *arg) +{ + struct virtio_dev *dev = (struct virtio_dev *)arg; + struct imsgev *iev = &dev->sync_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct viodev_msg msg; + struct imsg imsg; + ssize_t n; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: vioblk pipe dead (EV_READ)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: vioblk pipe dead (EV_WRITE)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatalx("%s: imsg_get (n=%ld)", __func__, n); + if (n == 0) + break; + + /* Unpack our message. They ALL should be dev messeges! 
*/ + IMSG_SIZE_CHECK(&imsg, &msg); + memcpy(&msg, imsg.data, sizeof(msg)); + imsg_free(&imsg); + + switch (msg.type) { + case VIODEV_MSG_DUMP: + /* Dump device */ + n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev)); + if (n != sizeof(*dev)) { + log_warnx("%s: failed to dump vioblk device", + __func__); + break; + } + case VIODEV_MSG_IO_READ: + /* Read IO: make sure to send a reply */ + msg.data = handle_io_read(&msg, dev); + msg.data_valid = 1; + imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + break; + case VIODEV_MSG_IO_WRITE: + /* Write IO: no reply needed */ + if (handle_io_write(&msg, dev) == 1) + virtio_assert_pic_irq(dev, 0); + break; + case VIODEV_MSG_SHUTDOWN: + event_del(&dev->sync_iev.ev); + event_loopbreak(); + return; + default: + fatalx("%s: invalid msg type %d", __func__, msg.type); + } + } + imsg_event_add(iev); +} + +static int +handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev) +{ + struct vioblk_dev *vioblk = &dev->vioblk; + uint32_t data = msg->data; + int intr = 0; + + switch (msg->reg) { + case VIRTIO_CONFIG_DEVICE_FEATURES: + case VIRTIO_CONFIG_QUEUE_SIZE: + case VIRTIO_CONFIG_ISR_STATUS: + log_warnx("%s: illegal write %x to %s", __progname, data, + virtio_reg_name(msg->reg)); + break; + case VIRTIO_CONFIG_GUEST_FEATURES: + vioblk->cfg.guest_feature = data; + break; + case VIRTIO_CONFIG_QUEUE_PFN: + vioblk->cfg.queue_pfn = data; + vioblk_update_qa(vioblk); + break; + case VIRTIO_CONFIG_QUEUE_SELECT: + vioblk->cfg.queue_select = data; + vioblk_update_qs(vioblk); + break; + case VIRTIO_CONFIG_QUEUE_NOTIFY: + vioblk->cfg.queue_notify = data; + if (vioblk_notifyq(vioblk)) + intr = 1; + break; + case VIRTIO_CONFIG_DEVICE_STATUS: + vioblk->cfg.device_status = data; + if (vioblk->cfg.device_status == 0) { + vioblk->cfg.guest_feature = 0; + vioblk->cfg.queue_pfn = 0; + vioblk_update_qa(vioblk); + vioblk->cfg.queue_size = 0; + vioblk_update_qs(vioblk); + vioblk->cfg.queue_select = 0; + vioblk->cfg.queue_notify = 0; + vioblk->cfg.isr_status = 0; + vioblk->vq[0].last_avail = 0; + vioblk->vq[0].notified_avail = 0; + virtio_deassert_pic_irq(dev, msg->vcpu); + } + break; + default: + break; + } + return (intr); +} + +static uint32_t +handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev) +{ + struct vioblk_dev *vioblk = &dev->vioblk; + uint8_t sz = msg->io_sz; + uint32_t data; + + if (msg->data_valid) + data = msg->data; + else + data = 0; + + switch (msg->reg) { + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: + switch (sz) { + case 4: + data = (uint32_t)(vioblk->sz); + break; + case 2: + data &= 0xFFFF0000; + data |= (uint32_t)(vioblk->sz) & 0xFFFF; + break; + case 1: + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz) & 0xFF; + break; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 8) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 16) & 0xFF; + } else if (sz == 2) { + data &= 0xFFFF0000; + data |= (uint32_t)(vioblk->sz >> 16) & 0xFFFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 24) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: + switch (sz) { + case 4: + data = (uint32_t)(vioblk->sz >> 32); + break; + case 2: + data &= 0xFFFF0000; 
+ data |= (uint32_t)(vioblk->sz >> 32) & 0xFFFF; + break; + case 1: + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 32) & 0xFF; + break; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 40) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 48) & 0xFF; + } else if (sz == 2) { + data &= 0xFFFF0000; + data |= (uint32_t)(vioblk->sz >> 48) & 0xFFFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->sz >> 56) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: + switch (sz) { + case 4: + data = (uint32_t)(vioblk->max_xfer); + break; + case 2: + data &= 0xFFFF0000; + data |= (uint32_t)(vioblk->max_xfer) & 0xFFFF; + break; + case 1: + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->max_xfer) & 0xFF; + break; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->max_xfer >> 8) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->max_xfer >> 16) & 0xFF; + } else if (sz == 2) { + data &= 0xFFFF0000; + data |= (uint32_t)(vioblk->max_xfer >> 16) + & 0xFFFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11: + if (sz == 1) { + data &= 0xFFFFFF00; + data |= (uint32_t)(vioblk->max_xfer >> 24) & 0xFF; + } + /* XXX handle invalid sz */ + break; + case VIRTIO_CONFIG_DEVICE_FEATURES: + data = vioblk->cfg.device_feature; + break; + case VIRTIO_CONFIG_GUEST_FEATURES: + data = vioblk->cfg.guest_feature; + break; + case VIRTIO_CONFIG_QUEUE_PFN: + data = vioblk->cfg.queue_pfn; + break; + case VIRTIO_CONFIG_QUEUE_SIZE: + data = vioblk->cfg.queue_size; + break; + case VIRTIO_CONFIG_QUEUE_SELECT: + data = vioblk->cfg.queue_select; + break; + case VIRTIO_CONFIG_QUEUE_NOTIFY: + data = vioblk->cfg.queue_notify; + break; + case VIRTIO_CONFIG_DEVICE_STATUS: + data = vioblk->cfg.device_status; + break; + case VIRTIO_CONFIG_ISR_STATUS: + data = vioblk->cfg.isr_status; + vioblk->cfg.isr_status = 0; + virtio_deassert_pic_irq(dev, 0); + break; + default: + return (0xFFFFFFFF); + } + + return (data); +} blob - 174bf406315264ab354e03e513dfc28380e77529 blob + bd40a032529eac7b500a62bc429116ac744dded1 --- usr.sbin/vmd/vioraw.c +++ usr.sbin/vmd/vioraw.c @@ -47,7 +47,7 @@ raw_close(void *file, int stayopen) /* * Initializes a raw disk image backing file from an fd. Stores the - * number of bytes in *szp, returning -1 for error, 0 for success. + * number of 512-byte sectors in *szp, returning -1 for error, 0 for success. 
*/ int virtio_raw_init(struct virtio_backing *file, off_t *szp, int *fd, size_t nfd) @@ -57,6 +57,7 @@ virtio_raw_init(struct virtio_backing *file, off_t *sz if (nfd != 1) return (-1); + sz = lseek(fd[0], 0, SEEK_END); if (sz == -1) return (-1); @@ -69,7 +70,7 @@ virtio_raw_init(struct virtio_backing *file, off_t *sz file->pread = raw_pread; file->pwrite = raw_pwrite; file->close = raw_close; - *szp = sz; + *szp = sz / 512; return (0); } blob - /dev/null blob + 4e6033e3404bc9f8e16a56f468e422476595b5b8 (mode 644) --- /dev/null +++ usr.sbin/vmd/vionet.c @@ -0,0 +1,938 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2023 Dave Voutila <d...@openbsd.org> + * Copyright (c) 2015 Mike Larkin <mlar...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include <sys/mman.h> +#include <sys/param.h> /* PAGE_SIZE */ +#include <sys/socket.h> + +#include <dev/pci/virtio_pcireg.h> +#include <dev/pv/virtioreg.h> + +#include <machine/vmmvar.h> + +#include <net/if.h> +#include <netinet/in.h> +#include <netinet/if_ether.h> +#include <netinet/ip.h> + +#include <errno.h> +#include <event.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "atomicio.h" +#include "pci.h" +#include "virtio.h" +#include "vmd.h" + +#define VIRTIO_NET_F_MAC (1 << 5) +#define RXQ 0 +#define TXQ 1 + +extern char *__progname; +extern struct vmd_vm *current_vm; + +/* Device Globals */ +struct event ev_tap; + +static int vionet_rx(struct vionet_dev *); +static void vionet_rx_event(int, short, void *); +static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *); +static int handle_io_write(struct viodev_msg *, struct virtio_dev *); +void vionet_notify_rx(struct virtio_dev *); +int vionet_notifyq(struct virtio_dev *); + +static void dev_dispatch_vm(int, short, void *); +static void handle_sync_io(int, short, void *); + +__dead void +vionet_main(int fd) +{ + struct virtio_dev dev; + struct vionet_dev *vionet = NULL; + struct viodev_msg msg; + struct vmd_vm vm; + struct vm_create_params *vcp; + ssize_t sz; + int ret; + + log_procinit("vionet"); + + /* + * We don't need access to the filesystem, but we aren't root, so + * unveil /var/empty with no permissions. + */ + if (unveil("/var/empty", "") == -1) + fatal("unveil /var/empty"); + if (unveil(NULL, NULL) == -1) + fatal("unveil lock"); + + /* stdio - needed for read/write to tap fd and channels to the vm. */ + if (pledge("stdio", NULL) == -1) + fatal("pledge"); + + /* Receive our vionet_dev, mostly preconfigured. 
*/ + sz = atomicio(read, fd, &dev, sizeof(dev)); + if (sz != sizeof(dev)) { + ret = errno; + log_warn("failed to receive vionet"); + goto fail; + } + if (dev.dev_type != VMD_DEVTYPE_NET) { + ret = EINVAL; + log_warn("received invalid device type"); + goto fail; + } + dev.sync_fd = fd; + vionet = &dev.vionet; + + log_info("%s: got vionet dev. tap fd = %d, syncfd = %d, asyncfd = %d", + __func__, vionet->data_fd, dev.sync_fd, dev.async_fd); + + /* Receive our vm information from the vm process. */ + memset(&vm, 0, sizeof(vm)); + sz = atomicio(read, dev.sync_fd, &vm, sizeof(vm)); + if (sz != sizeof(vm)) { + ret = EIO; + log_warnx("failed to receive vm details"); + goto fail; + } + vcp = &vm.vm_params.vmc_params; + current_vm = &vm; + setproctitle("%s/vionet[%d]", vcp->vcp_name, vionet->idx); + + /* Now that we have our vm information, we can remap memory. */ + ret = remap_guest_mem(&vm); + if (ret) + goto fail; + + /* If we're restoring hardware, re-initialize virtqueue hva's. */ + if (vm.vm_state & VM_STATE_RECEIVED) { + struct virtio_vq_info *vq_info; + void *hva = NULL; + + vq_info = &dev.vionet.vq[TXQ]; + if (vq_info->q_gpa != 0) { + log_info("%s: restoring TX virtqueue for gpa 0x%llx", + __func__, vq_info->q_gpa); + hva = hvaddr_mem(vq_info->q_gpa, + vring_size(VIONET_QUEUE_SIZE)); + if (hva == NULL) + fatalx("%s: hva == NULL", __func__); + vq_info->q_hva = hva; + } + + vq_info = &dev.vionet.vq[RXQ]; + if (vq_info->q_gpa != 0) { + log_info("%s: restoring RX virtqueue for gpa 0x%llx", + __func__, vq_info->q_gpa); + hva = hvaddr_mem(vq_info->q_gpa, + vring_size(VIONET_QUEUE_SIZE)); + if (hva == NULL) + fatalx("%s: hva == NULL", __func__); + vq_info->q_hva = hva; + } + } + + /* Initialize libevent so we can start wiring event handlers. */ + event_init(); + + /* Wire up an async imsg channel. */ + log_info("%s: wiring in async vm event handler (fd=%d)", __func__, + dev.async_fd); + if (vm_device_pipe(&dev, dev_dispatch_vm)) { + ret = EIO; + log_warnx("vm_device_pipe"); + goto fail; + } + + /* Wire up event handling for the tap fd. */ + log_info("%s: wiring in tap fd handler (fd=%d)", __func__, + vionet->data_fd); + event_set(&ev_tap, vionet->data_fd, EV_READ | EV_PERSIST, + vionet_rx_event, &dev); + + /* Configure our sync channel event handler. */ + log_info("%s: wiring in sync channel handler (fd=%d)", __func__, + dev.sync_fd); + if (fcntl(dev.sync_fd, F_SETFL, O_NONBLOCK) == -1) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto fail; + } + imsg_init(&dev.sync_iev.ibuf, dev.sync_fd); + dev.sync_iev.handler = handle_sync_io; + dev.sync_iev.data = &dev; + dev.sync_iev.events = EV_READ; + imsg_event_add(&dev.sync_iev); + + /* Send a ready message over the sync channel. */ + log_info("%s: telling vm %s device is ready", __func__, vcp->vcp_name); + memset(&msg, 0, sizeof(msg)); + msg.type = VIODEV_MSG_READY; + imsg_compose_event(&dev.sync_iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + + /* Send a ready message over the async channel. */ + log_info("%s: sending async ready message", __func__); + ret = imsg_compose_event(&dev.async_iev, IMSG_DEVOP_MSG, 0, 0, -1, + &msg, sizeof(msg)); + if (ret == -1) { + log_warnx("%s: failed to send async ready message!", __func__); + goto fail; + } + + /* Engage the event loop! */ + ret = event_dispatch(); + + /* Cleanup */ + if (ret == 0) { + close_fd(dev.sync_fd); + close_fd(dev.async_fd); + close_fd(vionet->data_fd); + _exit(ret); + /* NOTREACHED */ + } +fail: + /* Try firing off a message to the vm saying we're dying. 
*/ + memset(&msg, 0, sizeof(msg)); + msg.type = VIODEV_MSG_ERROR; + msg.data = ret; + imsg_compose(&dev.sync_iev.ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + imsg_flush(&dev.sync_iev.ibuf); + + close_fd(dev.sync_fd); + close_fd(dev.async_fd); + if (vionet != NULL) + close_fd(vionet->data_fd); + + _exit(ret); +} + +/* + * Update the gpa and hva of the virtqueue. + */ +void +vionet_update_qa(struct vionet_dev *dev) +{ + struct virtio_vq_info *vq_info; + void *hva = NULL; + + /* Invalid queue? */ + if (dev->cfg.queue_select > 1) + return; + + vq_info = &dev->vq[dev->cfg.queue_select]; + vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE; + dev->cfg.queue_pfn = vq_info->q_gpa >> 12; + + if (vq_info->q_gpa == 0) + vq_info->q_hva = NULL; + + hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE)); + if (hva == NULL) + fatalx("%s: hva == NULL", __func__); + + vq_info->q_hva = hva; +} + +/* + * Update the queue size. + */ +void +vionet_update_qs(struct vionet_dev *dev) +{ + struct virtio_vq_info *vq_info; + + /* Invalid queue? */ + if (dev->cfg.queue_select > 1) { + log_warnx("%s: !!! invalid queue selector %d", __func__, + dev->cfg.queue_select); + dev->cfg.queue_size = 0; + return; + } + + vq_info = &dev->vq[dev->cfg.queue_select]; + + /* Update queue pfn/size based on queue select */ + dev->cfg.queue_pfn = vq_info->q_gpa >> 12; + dev->cfg.queue_size = vq_info->qs; +} + +/* + * vionet_enq_rx + * + * Take a given packet from the host-side tap and copy it into the guest's + * buffers utilizing the rx virtio ring. If the packet length is invalid + * (too small or too large) or if there are not enough buffers available, + * the packet is dropped. + */ +int +vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc) +{ + uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx; + char *vr = NULL; + size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0; + size_t chain_len = 0; + struct vring_desc *desc, *pkt_desc, *hdr_desc; + struct vring_avail *avail; + struct vring_used *used; + struct virtio_vq_info *vq_info; + struct virtio_net_hdr hdr; + size_t hdr_sz; + + if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) { + log_warnx("%s: invalid packet size", __func__); + return (0); + } + + hdr_sz = sizeof(hdr); + + if (!(dev->cfg.device_status + & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) { + log_warnx("%s: driver not ready", __func__); + return (0); + } + + vq_info = &dev->vq[RXQ]; + vr = vq_info->q_hva; + if (vr == NULL) + fatalx("%s: vr == NULL", __func__); + + + /* Compute offsets in ring of descriptors, avail ring, and used ring */ + desc = (struct vring_desc *)(vr); + avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); + used = (struct vring_used *)(vr + vq_info->vq_usedoffset); + + idx = vq_info->last_avail & VIONET_QUEUE_MASK; + if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) { + log_debug("%s: insufficient available buffer capacity, " + "dropping packet.", __func__); + return (0); + } + + hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; + hdr_desc = &desc[hdr_desc_idx]; + + dxx = hdr_desc_idx; + chain_hdr_idx = dxx; + chain_len = 0; + + /* Process the descriptor and walk any potential chain. */ + do { + off = 0; + pkt_desc = &desc[dxx]; + if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) { + log_warnx("%s: invalid descriptor, not writable", + __func__); + return (0); + } + + /* How much data do we get to write? 
*/ + if (sz - bufsz > pkt_desc->len) + chunk_size = pkt_desc->len; + else + chunk_size = sz - bufsz; + + if (chain_len == 0) { + off = hdr_sz; + if (chunk_size == pkt_desc->len) + chunk_size -= off; + } + + /* Write a chunk of data if we need to */ + if (chunk_size && write_mem(pkt_desc->addr + off, + pkt + pkt_offset, chunk_size)) { + log_warnx("%s: failed to write to buffer 0x%llx", + __func__, pkt_desc->addr); + return (0); + } + + chain_len += chunk_size + off; + bufsz += chunk_size; + pkt_offset += chunk_size; + + dxx = pkt_desc->next & VIONET_QUEUE_MASK; + } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT); + + /* Move our marker in the ring...*/ + vq_info->last_avail = (vq_info->last_avail + 1) & + VIONET_QUEUE_MASK; + + /* Prepend the virtio net header in the first buffer. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.hdr_len = hdr_sz; + if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) { + log_warnx("vionet: rx enq header write_mem error @ 0x%llx", + hdr_desc->addr); + return (0); + } + + /* Update the index field in the used ring. This must be done last. */ + dev->cfg.isr_status = 1; + *spc = (vq_info->notified_avail - vq_info->last_avail) + & VIONET_QUEUE_MASK; + + /* Update the list of used buffers. */ + used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx; + used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len; + __sync_synchronize(); + used->idx++; + + return (1); +} + +/* + * vionet_rx + * + * Enqueue data that was received on a tap file descriptor + * to the vionet device queue. + */ +static int +vionet_rx(struct vionet_dev *dev) +{ + char buf[PAGE_SIZE]; + int num_enq = 0, spc = 0; + struct ether_header *eh; + ssize_t sz; + + do { + sz = read(dev->data_fd, buf, sizeof(buf)); + if (sz == -1) { + /* + * If we get EAGAIN, No data is currently available. + * Do not treat this as an error. + */ + if (errno != EAGAIN) + log_warn("%s: read error", __func__); + } else if (sz > 0) { + eh = (struct ether_header *)buf; + if (!dev->lockedmac || + ETHER_IS_MULTICAST(eh->ether_dhost) || + memcmp(eh->ether_dhost, dev->mac, + sizeof(eh->ether_dhost)) == 0) + num_enq += vionet_enq_rx(dev, buf, sz, &spc); + } else if (sz == 0) { + log_debug("%s: no data", __func__); + break; + } + } while (spc > 0 && sz > 0); + + return (num_enq); +} + +/* + * vionet_rx_event + * + * Called when new data can be received on the tap fd of a vionet device. + */ +static void +vionet_rx_event(int fd, short kind, void *arg) +{ + struct virtio_dev *dev = (struct virtio_dev *)arg; + + if (vionet_rx(&dev->vionet) > 0) + virtio_assert_pic_irq(dev, 0); +} + +void +vionet_notify_rx(struct virtio_dev *dev) +{ + struct vionet_dev *vionet = &dev->vionet; + struct vring_avail *avail; + struct virtio_vq_info *vq_info; + char *vr; + + vq_info = &vionet->vq[RXQ]; + vr = vq_info->q_hva; + if (vr == NULL) + fatalx("%s: vr == NULL", __func__); + + /* Compute offset into avail ring */ + avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); + vq_info->notified_avail = avail->idx - 1; +} + +int +vionet_notifyq(struct virtio_dev *dev) +{ + struct vionet_dev *vionet = &dev->vionet; + int ret = 0; + + switch (vionet->cfg.queue_notify) { + case RXQ: + vionet_notify_rx(dev); + break; + case TXQ: + ret = vionet_notify_tx(dev); + break; + default: + /* + * Catch the unimplemented queue ID 2 (control queue) as + * well as any bogus queue IDs. 
+ */ + log_debug("%s: notify for unimplemented queue ID %d", + __func__, vionet->cfg.queue_notify); + break; + } + + return (ret); +} + +int +vionet_notify_tx(struct virtio_dev *dev) +{ + uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt; + size_t pktsz, chunk_size = 0; + ssize_t dhcpsz = 0; + int num_enq, ofs, spc = 0; + char *vr = NULL, *pkt = NULL, *dhcppkt = NULL; + struct vionet_dev *vionet = &dev->vionet; + struct vring_desc *desc, *pkt_desc, *hdr_desc; + struct vring_avail *avail; + struct vring_used *used; + struct virtio_vq_info *vq_info; + struct ether_header *eh; + + if (!(vionet->cfg.device_status + & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) { + log_warnx("%s: driver not ready", __func__); + return (0); + } + + vq_info = &vionet->vq[TXQ]; + vr = vq_info->q_hva; + if (vr == NULL) + fatalx("%s: vr == NULL", __func__); + + /* Compute offsets in ring of descriptors, avail ring, and used ring */ + desc = (struct vring_desc *)(vr); + avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); + used = (struct vring_used *)(vr + vq_info->vq_usedoffset); + + num_enq = 0; + + idx = vq_info->last_avail & VIONET_QUEUE_MASK; + + if ((avail->idx & VIONET_QUEUE_MASK) == idx) + return (0); + + while ((avail->idx & VIONET_QUEUE_MASK) != idx) { + hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; + hdr_desc = &desc[hdr_desc_idx]; + pktsz = 0; + + cnt = 0; + dxx = hdr_desc_idx; + do { + pktsz += desc[dxx].len; + dxx = desc[dxx].next & VIONET_QUEUE_MASK; + + /* + * Virtio 1.0, cs04, section 2.4.5: + * "The number of descriptors in the table is defined + * by the queue size for this virtqueue: this is the + * maximum possible descriptor chain length." + */ + if (++cnt >= VIONET_QUEUE_SIZE) { + log_warnx("%s: descriptor table invalid", + __func__); + goto out; + } + } while (desc[dxx].flags & VRING_DESC_F_NEXT); + + pktsz += desc[dxx].len; + + /* Remove virtio header descriptor len */ + pktsz -= hdr_desc->len; + + /* Drop packets violating device MTU-based limits */ + if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) { + log_warnx("%s: invalid packet size %lu", __func__, + pktsz); + goto drop_packet; + } + pkt = malloc(pktsz); + if (pkt == NULL) { + log_warn("malloc error alloc packet buf"); + goto out; + } + + ofs = 0; + pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK; + pkt_desc = &desc[pkt_desc_idx]; + + while (pkt_desc->flags & VRING_DESC_F_NEXT) { + /* must be not writable */ + if (pkt_desc->flags & VRING_DESC_F_WRITE) { + log_warnx("unexpected writable tx desc " + "%d", pkt_desc_idx); + goto out; + } + + /* Check we don't read beyond allocated pktsz */ + if (pkt_desc->len > pktsz - ofs) { + log_warnx("%s: descriptor len past pkt len", + __func__); + chunk_size = pktsz - ofs; + } else + chunk_size = pkt_desc->len; + + /* Read packet from descriptor ring */ + if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { + log_warnx("vionet: packet read_mem error " + "@ 0x%llx", pkt_desc->addr); + goto out; + } + + ofs += pkt_desc->len; + pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK; + pkt_desc = &desc[pkt_desc_idx]; + } + + /* Now handle tail descriptor - must be not writable */ + if (pkt_desc->flags & VRING_DESC_F_WRITE) { + log_warnx("unexpected writable tx descriptor %d", + pkt_desc_idx); + goto out; + } + + /* Check we don't read beyond allocated pktsz */ + if (pkt_desc->len > pktsz - ofs) { + log_warnx("%s: descriptor len past pkt len", __func__); + chunk_size = pktsz - ofs - pkt_desc->len; + } else + chunk_size = pkt_desc->len; + + /* Read packet from descriptor ring */ + if 
(read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { + log_warnx("vionet: packet read_mem error @ " + "0x%llx", pkt_desc->addr); + goto out; + } + + /* reject other source addresses */ + if (vionet->lockedmac && pktsz >= ETHER_HDR_LEN && + (eh = (struct ether_header *)pkt) && + memcmp(eh->ether_shost, vionet->mac, + sizeof(eh->ether_shost)) != 0) + log_debug("vionet: wrong source address %s for vm %d", + ether_ntoa((struct ether_addr *) + eh->ether_shost), dev->vm_id); + else if (vionet->local && + (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) { + log_debug("vionet: dhcp request," + " local response size %zd", dhcpsz); + + /* XXX signed vs unsigned here, funky cast */ + } else if (write(vionet->data_fd, pkt, pktsz) != (int)pktsz) { + log_warnx("vionet: tx failed writing to tap: " + "%d", errno); + goto out; + } + + drop_packet: + vionet->cfg.isr_status = 1; + used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx; + used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len; + __sync_synchronize(); + used->idx++; + + vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK; + idx = (idx + 1) & VIONET_QUEUE_MASK; + + num_enq++; + + free(pkt); + pkt = NULL; + } + + if (dhcpsz > 0) + vionet_enq_rx(vionet, dhcppkt, dhcpsz, &spc); + +out: + free(pkt); + free(dhcppkt); + + return (1); +} + +static void +dev_dispatch_vm(int fd, short event, void *arg) +{ + struct virtio_dev *dev = arg; + struct vionet_dev *vionet = &dev->vionet; + struct imsgev *iev = &dev->async_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct imsg imsg; + ssize_t n = 0; + + if (dev == NULL) + fatalx("%s: missing vionet pointer", __func__); + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_READ)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_WRITE)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatal("%s: imsg_get", __func__); + if (n == 0) + break; + + switch (imsg.hdr.type) { + case IMSG_DEVOP_HOSTMAC: + IMSG_SIZE_CHECK(&imsg, vionet->hostmac); + memcpy(vionet->hostmac, imsg.data, + sizeof(vionet->hostmac)); + log_debug("%s: set hostmac", __func__); + break; + case IMSG_VMDOP_PAUSE_VM: + log_info("%s: pausing", __func__); + event_del(&ev_tap); + break; + case IMSG_VMDOP_UNPAUSE_VM: + log_debug("%s: unpausing", __func__); + if (vionet->cfg.device_status + & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) + event_add(&ev_tap, NULL); + break; + } + imsg_free(&imsg); + } + imsg_event_add(iev); +} + +/* + * Synchronous IO handler. 
+ * + */ +static void +handle_sync_io(int fd, short event, void *arg) +{ + struct virtio_dev *dev = (struct virtio_dev *)arg; + struct imsgev *iev = &dev->sync_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct viodev_msg msg; + struct imsg imsg; + ssize_t n; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_READ)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_WRITE)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatalx("%s: imsg_get (n=%ld)", __func__, n); + if (n == 0) + break; + + /* Unpack our message. They ALL should be dev messeges! */ + IMSG_SIZE_CHECK(&imsg, &msg); + memcpy(&msg, imsg.data, sizeof(msg)); + imsg_free(&imsg); + + switch (msg.type) { + case VIODEV_MSG_DUMP: + /* Dump device */ + n = atomicio(vwrite, dev->sync_fd, dev, sizeof(*dev)); + if (n != sizeof(*dev)) { + log_warnx("%s: failed to dump vioblk device", + __func__); + break; + } + case VIODEV_MSG_IO_READ: + /* Read IO: make sure to send a reply */ + msg.data = handle_io_read(&msg, dev); + msg.data_valid = 1; + imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + break; + case VIODEV_MSG_IO_WRITE: + /* Write IO: no reply needed */ + if (handle_io_write(&msg, dev) == 1) + virtio_assert_pic_irq(dev, 0); + break; + case VIODEV_MSG_SHUTDOWN: + event_del(&dev->sync_iev.ev); + event_del(&ev_tap); + event_loopbreak(); + return; + default: + fatalx("%s: invalid msg type %d", __func__, msg.type); + } + } + imsg_event_add(iev); +} + +static int +handle_io_write(struct viodev_msg *msg, struct virtio_dev *dev) +{ + struct vionet_dev *vionet = &dev->vionet; + uint32_t data = msg->data; + int intr = 0; + + switch (msg->reg) { + case VIRTIO_CONFIG_DEVICE_FEATURES: + case VIRTIO_CONFIG_QUEUE_SIZE: + case VIRTIO_CONFIG_ISR_STATUS: + log_warnx("%s: illegal write %x to %s", __progname, data, + virtio_reg_name(msg->reg)); + break; + case VIRTIO_CONFIG_GUEST_FEATURES: + vionet->cfg.guest_feature = data; + break; + case VIRTIO_CONFIG_QUEUE_PFN: + vionet->cfg.queue_pfn = data; + vionet_update_qa(vionet); + break; + case VIRTIO_CONFIG_QUEUE_SELECT: + vionet->cfg.queue_select = data; + vionet_update_qs(vionet); + break; + case VIRTIO_CONFIG_QUEUE_NOTIFY: + vionet->cfg.queue_notify = data; + if (vionet_notifyq(dev)) + intr = 1; + break; + case VIRTIO_CONFIG_DEVICE_STATUS: + vionet->cfg.device_status = data; + if (vionet->cfg.device_status == 0) { + vionet->cfg.guest_feature = 0; + + vionet->cfg.queue_pfn = 0; + vionet_update_qa(vionet); + + vionet->cfg.queue_size = 0; + vionet_update_qs(vionet); + + vionet->cfg.queue_select = 0; + vionet->cfg.queue_notify = 0; + vionet->cfg.isr_status = 0; + vionet->vq[RXQ].last_avail = 0; + vionet->vq[RXQ].notified_avail = 0; + vionet->vq[TXQ].last_avail = 0; + vionet->vq[TXQ].notified_avail = 0; + virtio_deassert_pic_irq(dev, msg->vcpu); + } + event_del(&ev_tap); + if (vionet->cfg.device_status + & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK) { + if (event_add(&ev_tap, NULL)) + log_warn("%s: could not initialize virtio tap " + "event handler", __func__); + } + break; + default: + 
break; + } + return (intr); +} + +static uint32_t +handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev) +{ + struct vionet_dev *vionet = &dev->vionet; + uint32_t data; + + switch (msg->reg) { + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: + data = vionet->mac[msg->reg - + VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI]; + break; + case VIRTIO_CONFIG_DEVICE_FEATURES: + data = vionet->cfg.device_feature; + break; + case VIRTIO_CONFIG_GUEST_FEATURES: + data = vionet->cfg.guest_feature; + break; + case VIRTIO_CONFIG_QUEUE_PFN: + data = vionet->cfg.queue_pfn; + break; + case VIRTIO_CONFIG_QUEUE_SIZE: + data = vionet->cfg.queue_size; + break; + case VIRTIO_CONFIG_QUEUE_SELECT: + data = vionet->cfg.queue_select; + break; + case VIRTIO_CONFIG_QUEUE_NOTIFY: + data = vionet->cfg.queue_notify; + break; + case VIRTIO_CONFIG_DEVICE_STATUS: + data = vionet->cfg.device_status; + break; + case VIRTIO_CONFIG_ISR_STATUS: + data = vionet->cfg.isr_status; + vionet->cfg.isr_status = 0; + virtio_deassert_pic_irq(dev, 0); + break; + default: + return (0xFFFFFFFF); + } + + return (data); +} blob - 62616958a0c63c904a39dd29a91284215c661b6e blob + 93c9c01a00d7cef603a5597c0c0c76010811e803 --- usr.sbin/vmd/virtio.c +++ usr.sbin/vmd/virtio.c @@ -18,6 +18,7 @@ #include <sys/param.h> /* PAGE_SIZE */ #include <sys/socket.h> +#include <sys/wait.h> #include <machine/vmmvar.h> #include <dev/pci/pcireg.h> @@ -34,6 +35,7 @@ #include <errno.h> #include <event.h> +#include <fcntl.h> #include <poll.h> #include <stddef.h> #include <stdlib.h> @@ -47,15 +49,16 @@ extern char *__progname; #include "vmd.h" #include "vmm.h" +extern struct vmd *env; extern char *__progname; + +struct event ev_sigchld; struct viornd_dev viornd; -struct vioblk_dev *vioblk; -struct vionet_dev *vionet; struct vioscsi_dev *vioscsi; struct vmmci_dev vmmci; -int nr_vionet; -int nr_vioblk; +/* Devices emulated in subprocesses are inserted into this list. 
*/ +SLIST_HEAD(virtio_dev_head, virtio_dev) virtio_devs; #define MAXPHYS (64 * 1024) /* max raw I/O transfer size */ @@ -68,22 +71,12 @@ const char * #define RXQ 0 #define TXQ 1 +static int virtio_dev_launch(struct vmd_vm *, struct virtio_dev *); +static void virtio_dispatch_dev(int, short, void *); +static void virtio_sighdlr(int, short, void *); +static int handle_dev_msg(struct viodev_msg *, struct virtio_dev *); + const char * -vioblk_cmd_name(uint32_t type) -{ - switch (type) { - case VIRTIO_BLK_T_IN: return "read"; - case VIRTIO_BLK_T_OUT: return "write"; - case VIRTIO_BLK_T_SCSI_CMD: return "scsi read"; - case VIRTIO_BLK_T_SCSI_CMD_OUT: return "scsi write"; - case VIRTIO_BLK_T_FLUSH: return "flush"; - case VIRTIO_BLK_T_FLUSH_OUT: return "flush out"; - case VIRTIO_BLK_T_GET_ID: return "get id"; - default: return "unknown"; - } -} - -static const char * virtio_reg_name(uint8_t reg) { switch (reg) { @@ -95,8 +88,11 @@ virtio_reg_name(uint8_t reg) case VIRTIO_CONFIG_QUEUE_NOTIFY: return "queue notify"; case VIRTIO_CONFIG_DEVICE_STATUS: return "device status"; case VIRTIO_CONFIG_ISR_STATUS: return "isr status"; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: return "device config 0"; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: return "device config 1"; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI...VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: + return "device config 0"; + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: + case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: + return "device config 1"; case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: return "device config 2"; case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 12: return "device config 3"; case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 16: return "device config 4"; @@ -154,7 +150,7 @@ viornd_update_qa(void) hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIORND_QUEUE_SIZE)); if (hva == NULL) - fatal("viornd_update_qa"); + fatalx("viornd_update_qa"); vq_info->q_hva = hva; } @@ -284,1194 +280,9 @@ virtio_rnd_io(int dir, uint16_t reg, uint32_t *data, u } } return (0); -} - -void -vioblk_update_qa(struct vioblk_dev *dev) -{ - struct virtio_vq_info *vq_info; - void *hva = NULL; - - /* Invalid queue? */ - if (dev->cfg.queue_select > 0) - return; - - vq_info = &dev->vq[dev->cfg.queue_select]; - vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE; - - hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIOBLK_QUEUE_SIZE)); - if (hva == NULL) - fatal("vioblk_update_qa"); - vq_info->q_hva = hva; -} - -void -vioblk_update_qs(struct vioblk_dev *dev) -{ - struct virtio_vq_info *vq_info; - - /* Invalid queue? 
*/ - if (dev->cfg.queue_select > 0) { - dev->cfg.queue_size = 0; - return; - } - - vq_info = &dev->vq[dev->cfg.queue_select]; - - /* Update queue pfn/size based on queue select */ - dev->cfg.queue_pfn = vq_info->q_gpa >> 12; - dev->cfg.queue_size = vq_info->qs; -} - -static void -vioblk_free_info(struct ioinfo *info) -{ - if (!info) - return; - free(info->buf); - free(info); -} - -static struct ioinfo * -vioblk_start_read(struct vioblk_dev *dev, off_t sector, size_t sz) -{ - struct ioinfo *info; - - /* Limit to 64M for now */ - if (sz > (1 << 26)) { - log_warnx("%s: read size exceeded 64M", __func__); - return (NULL); - } - - info = calloc(1, sizeof(*info)); - if (!info) - goto nomem; - info->buf = malloc(sz); - if (info->buf == NULL) - goto nomem; - info->len = sz; - info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; - info->file = &dev->file; - - return info; - -nomem: - free(info); - log_warn("malloc error vioblk read"); - return (NULL); -} - - -static const uint8_t * -vioblk_finish_read(struct ioinfo *info) -{ - struct virtio_backing *file; - - file = info->file; - if (file->pread(file->p, info->buf, info->len, info->offset) != info->len) { - info->error = errno; - log_warn("vioblk read error"); - return NULL; - } - - return info->buf; -} - -static struct ioinfo * -vioblk_start_write(struct vioblk_dev *dev, off_t sector, - paddr_t addr, size_t len) -{ - struct ioinfo *info; - - /* Limit to 64M for now */ - if (len > (1 << 26)) { - log_warnx("%s: write size exceeded 64M", __func__); - return (NULL); - } - - info = calloc(1, sizeof(*info)); - if (!info) - goto nomem; - - info->buf = malloc(len); - if (info->buf == NULL) - goto nomem; - info->len = len; - info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; - info->file = &dev->file; - - if (read_mem(addr, info->buf, info->len)) { - vioblk_free_info(info); - return NULL; - } - - return info; - -nomem: - free(info); - log_warn("malloc error vioblk write"); - return (NULL); -} - -static int -vioblk_finish_write(struct ioinfo *info) -{ - struct virtio_backing *file; - - file = info->file; - if (file->pwrite(file->p, info->buf, info->len, info->offset) != info->len) { - log_warn("vioblk write error"); - return EIO; - } - return 0; -} - -/* - * XXX in various cases, ds should be set to VIRTIO_BLK_S_IOERR, if we can - */ -int -vioblk_notifyq(struct vioblk_dev *dev) -{ - uint16_t idx, cmd_desc_idx, secdata_desc_idx, ds_desc_idx; - uint8_t ds; - int cnt; - off_t secbias; - char *vr; - struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc; - struct vring_avail *avail; - struct vring_used *used; - struct virtio_blk_req_hdr cmd; - struct virtio_vq_info *vq_info; - - /* Invalid queue? 
*/ - if (dev->cfg.queue_notify > 0) - return (0); - - vq_info = &dev->vq[dev->cfg.queue_notify]; - vr = vq_info->q_hva; - if (vr == NULL) - fatalx("%s: null vring", __func__); - - /* Compute offsets in ring of descriptors, avail ring, and used ring */ - desc = (struct vring_desc *)(vr); - avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); - used = (struct vring_used *)(vr + vq_info->vq_usedoffset); - - idx = vq_info->last_avail & VIOBLK_QUEUE_MASK; - - if ((avail->idx & VIOBLK_QUEUE_MASK) == idx) { - log_debug("%s - nothing to do?", __func__); - return (0); - } - - while (idx != (avail->idx & VIOBLK_QUEUE_MASK)) { - - ds = VIRTIO_BLK_S_IOERR; - cmd_desc_idx = avail->ring[idx] & VIOBLK_QUEUE_MASK; - cmd_desc = &desc[cmd_desc_idx]; - - if ((cmd_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("unchained vioblk cmd descriptor received " - "(idx %d)", cmd_desc_idx); - goto out; - } - - /* Read command from descriptor ring */ - if (cmd_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("vioblk: unexpected writable cmd descriptor " - "%d", cmd_desc_idx); - goto out; - } - if (read_mem(cmd_desc->addr, &cmd, sizeof(cmd))) { - log_warnx("vioblk: command read_mem error @ 0x%llx", - cmd_desc->addr); - goto out; - } - - switch (cmd.type) { - case VIRTIO_BLK_T_IN: - /* first descriptor */ - secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("unchained vioblk data descriptor " - "received (idx %d)", cmd_desc_idx); - goto out; - } - - cnt = 0; - secbias = 0; - do { - struct ioinfo *info; - const uint8_t *secdata; - - if ((secdata_desc->flags & VRING_DESC_F_WRITE) - == 0) { - log_warnx("vioblk: unwritable data " - "descriptor %d", secdata_desc_idx); - goto out; - } - - info = vioblk_start_read(dev, - cmd.sector + secbias, secdata_desc->len); - - if (info == NULL) { - log_warnx("vioblk: can't start read"); - goto out; - } - - /* read the data, use current data descriptor */ - secdata = vioblk_finish_read(info); - if (secdata == NULL) { - vioblk_free_info(info); - log_warnx("vioblk: block read error, " - "sector %lld", cmd.sector); - goto out; - } - - if (write_mem(secdata_desc->addr, secdata, - secdata_desc->len)) { - log_warnx("can't write sector " - "data to gpa @ 0x%llx", - secdata_desc->addr); - vioblk_free_info(info); - goto out; - } - - vioblk_free_info(info); - - secbias += (secdata_desc->len / - VIRTIO_BLK_SECTOR_SIZE); - secdata_desc_idx = secdata_desc->next & - VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - /* Guard against infinite chains */ - if (++cnt >= VIOBLK_QUEUE_SIZE) { - log_warnx("%s: descriptor table " - "invalid", __func__); - goto out; - } - } while (secdata_desc->flags & VRING_DESC_F_NEXT); - - ds_desc_idx = secdata_desc_idx; - ds_desc = secdata_desc; - - ds = VIRTIO_BLK_S_OK; - break; - case VIRTIO_BLK_T_OUT: - secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("wr vioblk: unchained vioblk data " - "descriptor received (idx %d)", - cmd_desc_idx); - goto out; - } - - if (secdata_desc->len > dev->max_xfer) { - log_warnx("%s: invalid read size %d requested", - __func__, secdata_desc->len); - goto out; - } - - cnt = 0; - secbias = 0; - do { - struct ioinfo *info; - - if (secdata_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("wr vioblk: unexpected " - "writable data descriptor %d", - secdata_desc_idx); - goto out; - } - - info 
= vioblk_start_write(dev, - cmd.sector + secbias, - secdata_desc->addr, secdata_desc->len); - - if (info == NULL) { - log_warnx("wr vioblk: can't read " - "sector data @ 0x%llx", - secdata_desc->addr); - goto out; - } - - if (vioblk_finish_write(info)) { - log_warnx("wr vioblk: disk write " - "error"); - vioblk_free_info(info); - goto out; - } - - vioblk_free_info(info); - - secbias += secdata_desc->len / - VIRTIO_BLK_SECTOR_SIZE; - - secdata_desc_idx = secdata_desc->next & - VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - /* Guard against infinite chains */ - if (++cnt >= VIOBLK_QUEUE_SIZE) { - log_warnx("%s: descriptor table " - "invalid", __func__); - goto out; - } - } while (secdata_desc->flags & VRING_DESC_F_NEXT); - - ds_desc_idx = secdata_desc_idx; - ds_desc = secdata_desc; - - ds = VIRTIO_BLK_S_OK; - break; - case VIRTIO_BLK_T_FLUSH: - case VIRTIO_BLK_T_FLUSH_OUT: - ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - ds_desc = &desc[ds_desc_idx]; - - ds = VIRTIO_BLK_S_UNSUPP; - break; - case VIRTIO_BLK_T_GET_ID: - secdata_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - secdata_desc = &desc[secdata_desc_idx]; - - /* - * We don't support this command yet. While it's not - * officially part of the virtio spec (will be in v1.2) - * there's no feature to negotiate. Linux drivers will - * often send this command regardless. - * - * When the command is received, it should appear as a - * chain of 3 descriptors, similar to the IN/OUT - * commands. The middle descriptor should have have a - * length of VIRTIO_BLK_ID_BYTES bytes. - */ - if ((secdata_desc->flags & VRING_DESC_F_NEXT) == 0) { - log_warnx("id vioblk: unchained vioblk data " - "descriptor received (idx %d)", - cmd_desc_idx); - goto out; - } - - /* Skip the data descriptor. 
*/ - ds_desc_idx = secdata_desc->next & VIOBLK_QUEUE_MASK; - ds_desc = &desc[ds_desc_idx]; - - ds = VIRTIO_BLK_S_UNSUPP; - break; - default: - log_warnx("%s: unsupported command 0x%x", __func__, - cmd.type); - ds_desc_idx = cmd_desc->next & VIOBLK_QUEUE_MASK; - ds_desc = &desc[ds_desc_idx]; - - ds = VIRTIO_BLK_S_UNSUPP; - break; - } - - if ((ds_desc->flags & VRING_DESC_F_WRITE) == 0) { - log_warnx("%s: ds descriptor %d unwritable", __func__, - ds_desc_idx); - goto out; - } - if (write_mem(ds_desc->addr, &ds, sizeof(ds))) { - log_warnx("%s: can't write device status data @ 0x%llx", - __func__, ds_desc->addr); - goto out; - } - - dev->cfg.isr_status = 1; - used->ring[used->idx & VIOBLK_QUEUE_MASK].id = cmd_desc_idx; - used->ring[used->idx & VIOBLK_QUEUE_MASK].len = cmd_desc->len; - __sync_synchronize(); - used->idx++; - - vq_info->last_avail = avail->idx & VIOBLK_QUEUE_MASK; - idx = (idx + 1) & VIOBLK_QUEUE_MASK; - } -out: - return (1); } int -virtio_blk_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, - void *cookie, uint8_t sz) -{ - struct vioblk_dev *dev = (struct vioblk_dev *)cookie; - - *intr = 0xFF; - - - if (dir == 0) { - switch (reg) { - case VIRTIO_CONFIG_DEVICE_FEATURES: - case VIRTIO_CONFIG_QUEUE_SIZE: - case VIRTIO_CONFIG_ISR_STATUS: - log_warnx("%s: illegal write %x to %s", - __progname, *data, virtio_reg_name(reg)); - break; - case VIRTIO_CONFIG_GUEST_FEATURES: - dev->cfg.guest_feature = *data; - break; - case VIRTIO_CONFIG_QUEUE_PFN: - dev->cfg.queue_pfn = *data; - vioblk_update_qa(dev); - break; - case VIRTIO_CONFIG_QUEUE_SELECT: - dev->cfg.queue_select = *data; - vioblk_update_qs(dev); - break; - case VIRTIO_CONFIG_QUEUE_NOTIFY: - dev->cfg.queue_notify = *data; - if (vioblk_notifyq(dev)) - *intr = 1; - break; - case VIRTIO_CONFIG_DEVICE_STATUS: - dev->cfg.device_status = *data; - if (dev->cfg.device_status == 0) { - log_debug("%s: device reset", __func__); - dev->cfg.guest_feature = 0; - dev->cfg.queue_pfn = 0; - vioblk_update_qa(dev); - dev->cfg.queue_size = 0; - vioblk_update_qs(dev); - dev->cfg.queue_select = 0; - dev->cfg.queue_notify = 0; - dev->cfg.isr_status = 0; - dev->vq[0].last_avail = 0; - vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq); - } - break; - default: - break; - } - } else { - switch (reg) { - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: - switch (sz) { - case 4: - *data = (uint32_t)(dev->sz); - break; - case 2: - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->sz) & 0xFFFF; - break; - case 1: - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz) & 0xFF; - break; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 8) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 16) & 0xFF; - } else if (sz == 2) { - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->sz >> 16) & 0xFFFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 24) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: - switch (sz) { - case 4: - *data = (uint32_t)(dev->sz >> 32); - break; - case 2: - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->sz >> 32) & 0xFFFF; - break; - case 1: - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 32) & 0xFF; - break; - } - /* XXX handle invalid sz */ - break; - case 
VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 40) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 6: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 48) & 0xFF; - } else if (sz == 2) { - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->sz >> 48) & 0xFFFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 7: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->sz >> 56) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 8: - switch (sz) { - case 4: - *data = (uint32_t)(dev->max_xfer); - break; - case 2: - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->max_xfer) & 0xFFFF; - break; - case 1: - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->max_xfer) & 0xFF; - break; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 9: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->max_xfer >> 8) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 10: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->max_xfer >> 16) & 0xFF; - } else if (sz == 2) { - *data &= 0xFFFF0000; - *data |= (uint32_t)(dev->max_xfer >> 16) - & 0xFFFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 11: - if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint32_t)(dev->max_xfer >> 24) & 0xFF; - } - /* XXX handle invalid sz */ - break; - case VIRTIO_CONFIG_DEVICE_FEATURES: - *data = dev->cfg.device_feature; - break; - case VIRTIO_CONFIG_GUEST_FEATURES: - *data = dev->cfg.guest_feature; - break; - case VIRTIO_CONFIG_QUEUE_PFN: - *data = dev->cfg.queue_pfn; - break; - case VIRTIO_CONFIG_QUEUE_SIZE: - if (sz == 4) - *data = dev->cfg.queue_size; - else if (sz == 2) { - *data &= 0xFFFF0000; - *data |= (uint16_t)dev->cfg.queue_size; - } else if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint8_t)dev->cfg.queue_size; - } - break; - case VIRTIO_CONFIG_QUEUE_SELECT: - *data = dev->cfg.queue_select; - break; - case VIRTIO_CONFIG_QUEUE_NOTIFY: - *data = dev->cfg.queue_notify; - break; - case VIRTIO_CONFIG_DEVICE_STATUS: - if (sz == 4) - *data = dev->cfg.device_status; - else if (sz == 2) { - *data &= 0xFFFF0000; - *data |= (uint16_t)dev->cfg.device_status; - } else if (sz == 1) { - *data &= 0xFFFFFF00; - *data |= (uint8_t)dev->cfg.device_status; - } - break; - case VIRTIO_CONFIG_ISR_STATUS: - *data = dev->cfg.isr_status; - dev->cfg.isr_status = 0; - vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq); - break; - } - } - return (0); -} - -int -virtio_net_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, - void *cookie, uint8_t sz) -{ - struct vionet_dev *dev = (struct vionet_dev *)cookie; - - *intr = 0xFF; - mutex_lock(&dev->mutex); - - if (dir == 0) { - switch (reg) { - case VIRTIO_CONFIG_DEVICE_FEATURES: - case VIRTIO_CONFIG_QUEUE_SIZE: - case VIRTIO_CONFIG_ISR_STATUS: - log_warnx("%s: illegal write %x to %s", - __progname, *data, virtio_reg_name(reg)); - break; - case VIRTIO_CONFIG_GUEST_FEATURES: - dev->cfg.guest_feature = *data; - break; - case VIRTIO_CONFIG_QUEUE_PFN: - dev->cfg.queue_pfn = *data; - vionet_update_qa(dev); - break; - case VIRTIO_CONFIG_QUEUE_SELECT: - dev->cfg.queue_select = *data; - vionet_update_qs(dev); - break; - case VIRTIO_CONFIG_QUEUE_NOTIFY: - dev->cfg.queue_notify = *data; - if (vionet_notifyq(dev)) - *intr = 1; - break; - case 
VIRTIO_CONFIG_DEVICE_STATUS: - dev->cfg.device_status = *data; - if (dev->cfg.device_status == 0) { - log_debug("%s: device reset", __func__); - dev->cfg.guest_feature = 0; - dev->cfg.queue_pfn = 0; - vionet_update_qa(dev); - dev->cfg.queue_size = 0; - vionet_update_qs(dev); - dev->cfg.queue_select = 0; - dev->cfg.queue_notify = 0; - dev->cfg.isr_status = 0; - dev->vq[RXQ].last_avail = 0; - dev->vq[RXQ].notified_avail = 0; - dev->vq[TXQ].last_avail = 0; - dev->vq[TXQ].notified_avail = 0; - vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq); - } - break; - default: - break; - } - } else { - switch (reg) { - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI: - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 1: - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 2: - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 3: - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 4: - case VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI + 5: - *data = dev->mac[reg - - VIRTIO_CONFIG_DEVICE_CONFIG_NOMSI]; - break; - case VIRTIO_CONFIG_DEVICE_FEATURES: - *data = dev->cfg.device_feature; - break; - case VIRTIO_CONFIG_GUEST_FEATURES: - *data = dev->cfg.guest_feature; - break; - case VIRTIO_CONFIG_QUEUE_PFN: - *data = dev->cfg.queue_pfn; - break; - case VIRTIO_CONFIG_QUEUE_SIZE: - *data = dev->cfg.queue_size; - break; - case VIRTIO_CONFIG_QUEUE_SELECT: - *data = dev->cfg.queue_select; - break; - case VIRTIO_CONFIG_QUEUE_NOTIFY: - *data = dev->cfg.queue_notify; - break; - case VIRTIO_CONFIG_DEVICE_STATUS: - *data = dev->cfg.device_status; - break; - case VIRTIO_CONFIG_ISR_STATUS: - *data = dev->cfg.isr_status; - dev->cfg.isr_status = 0; - vcpu_deassert_pic_irq(dev->vm_id, 0, dev->irq); - break; - } - } - - mutex_unlock(&dev->mutex); - return (0); -} - -/* - * Must be called with dev->mutex acquired. - */ -void -vionet_update_qa(struct vionet_dev *dev) -{ - struct virtio_vq_info *vq_info; - void *hva = NULL; - - /* Invalid queue? */ - if (dev->cfg.queue_select > 1) - return; - - vq_info = &dev->vq[dev->cfg.queue_select]; - vq_info->q_gpa = (uint64_t)dev->cfg.queue_pfn * VIRTIO_PAGE_SIZE; - - hva = hvaddr_mem(vq_info->q_gpa, vring_size(VIONET_QUEUE_SIZE)); - if (hva == NULL) - fatal("vionet_update_qa"); - vq_info->q_hva = hva; -} - -/* - * Must be called with dev->mutex acquired. - */ -void -vionet_update_qs(struct vionet_dev *dev) -{ - struct virtio_vq_info *vq_info; - - /* Invalid queue? */ - if (dev->cfg.queue_select > 1) { - dev->cfg.queue_size = 0; - return; - } - - vq_info = &dev->vq[dev->cfg.queue_select]; - - /* Update queue pfn/size based on queue select */ - dev->cfg.queue_pfn = vq_info->q_gpa >> 12; - dev->cfg.queue_size = vq_info->qs; -} - -/* - * vionet_enq_rx - * - * Take a given packet from the host-side tap and copy it into the guest's - * buffers utilizing the rx virtio ring. If the packet length is invalid - * (too small or too large) or if there are not enough buffers available, - * the packet is dropped. - * - * Must be called with dev->mutex acquired. 
- */ -int -vionet_enq_rx(struct vionet_dev *dev, char *pkt, size_t sz, int *spc) -{ - uint16_t dxx, idx, hdr_desc_idx, chain_hdr_idx; - char *vr = NULL; - size_t bufsz = 0, off = 0, pkt_offset = 0, chunk_size = 0; - size_t chain_len = 0; - struct vring_desc *desc, *pkt_desc, *hdr_desc; - struct vring_avail *avail; - struct vring_used *used; - struct virtio_vq_info *vq_info; - struct virtio_net_hdr hdr; - size_t hdr_sz; - - if (sz < VIONET_MIN_TXLEN || sz > VIONET_MAX_TXLEN) { - log_warn("%s: invalid packet size", __func__); - return (0); - } - - hdr_sz = sizeof(hdr); - - if (!(dev->cfg.device_status & VIRTIO_CONFIG_DEVICE_STATUS_DRIVER_OK)) - return (0); - - vq_info = &dev->vq[RXQ]; - vr = vq_info->q_hva; - if (vr == NULL) - fatalx("%s: null vring", __func__); - - /* Compute offsets in ring of descriptors, avail ring, and used ring */ - desc = (struct vring_desc *)(vr); - avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); - used = (struct vring_used *)(vr + vq_info->vq_usedoffset); - - idx = vq_info->last_avail & VIONET_QUEUE_MASK; - if ((vq_info->notified_avail & VIONET_QUEUE_MASK) == idx) { - log_debug("%s: insufficient available buffer capacity, " - "dropping packet.", __func__); - return (0); - } - - hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; - hdr_desc = &desc[hdr_desc_idx]; - - dxx = hdr_desc_idx; - chain_hdr_idx = dxx; - chain_len = 0; - - /* Process the descriptor and walk any potential chain. */ - do { - off = 0; - pkt_desc = &desc[dxx]; - if (!(pkt_desc->flags & VRING_DESC_F_WRITE)) { - log_warnx("%s: invalid descriptor, not writable", - __func__); - return (0); - } - - /* How much data do we get to write? */ - if (sz - bufsz > pkt_desc->len) - chunk_size = pkt_desc->len; - else - chunk_size = sz - bufsz; - - if (chain_len == 0) { - off = hdr_sz; - if (chunk_size == pkt_desc->len) - chunk_size -= off; - } - - /* Write a chunk of data if we need to */ - if (chunk_size && write_mem(pkt_desc->addr + off, - pkt + pkt_offset, chunk_size)) { - log_warnx("%s: failed to write to buffer 0x%llx", - __func__, pkt_desc->addr); - return (0); - } - - chain_len += chunk_size + off; - bufsz += chunk_size; - pkt_offset += chunk_size; - - dxx = pkt_desc->next & VIONET_QUEUE_MASK; - } while (bufsz < sz && pkt_desc->flags & VRING_DESC_F_NEXT); - - /* Move our marker in the ring...*/ - vq_info->last_avail = (vq_info->last_avail + 1) & - VIONET_QUEUE_MASK; - - /* Prepend the virtio net header in the first buffer. */ - memset(&hdr, 0, sizeof(hdr)); - hdr.hdr_len = hdr_sz; - if (write_mem(hdr_desc->addr, &hdr, hdr_sz)) { - log_warnx("vionet: rx enq header write_mem error @ 0x%llx", - hdr_desc->addr); - return (0); - } - - /* Update the index field in the used ring. This must be done last. */ - dev->cfg.isr_status = 1; - *spc = (vq_info->notified_avail - vq_info->last_avail) - & VIONET_QUEUE_MASK; - - /* Update the list of used buffers. */ - used->ring[used->idx & VIONET_QUEUE_MASK].id = chain_hdr_idx; - used->ring[used->idx & VIONET_QUEUE_MASK].len = chain_len; - __sync_synchronize(); - used->idx++; - - return (1); -} - -/* - * vionet_rx - * - * Enqueue data that was received on a tap file descriptor - * to the vionet device queue. - * - * Must be called with dev->mutex acquired. - */ -static int -vionet_rx(struct vionet_dev *dev) -{ - char buf[PAGE_SIZE]; - int num_enq = 0, spc = 0; - struct ether_header *eh; - ssize_t sz; - - do { - sz = read(dev->fd, buf, sizeof(buf)); - if (sz == -1) { - /* - * If we get EAGAIN, No data is currently available. - * Do not treat this as an error. 
- */ - if (errno != EAGAIN) - log_warn("unexpected read error on vionet " - "device"); - } else if (sz > 0) { - eh = (struct ether_header *)buf; - if (!dev->lockedmac || - ETHER_IS_MULTICAST(eh->ether_dhost) || - memcmp(eh->ether_dhost, dev->mac, - sizeof(eh->ether_dhost)) == 0) - num_enq += vionet_enq_rx(dev, buf, sz, &spc); - } else if (sz == 0) { - log_debug("process_rx: no data"); - break; - } - } while (spc > 0 && sz > 0); - - return (num_enq); -} - -/* - * vionet_rx_event - * - * Called from the event handling thread when new data can be - * received on the tap fd of a vionet device. - */ -static void -vionet_rx_event(int fd, short kind, void *arg) -{ - struct vionet_dev *dev = arg; - - mutex_lock(&dev->mutex); - - if (vionet_rx(dev) > 0) { - /* XXX: vcpu_id */ - vcpu_assert_pic_irq(dev->vm_id, 0, dev->irq); - } - - mutex_unlock(&dev->mutex); -} - -/* - * Must be called with dev->mutex acquired. - */ -void -vionet_notify_rx(struct vionet_dev *dev) -{ - char *vr; - struct vring_avail *avail; - struct virtio_vq_info *vq_info; - - vq_info = &dev->vq[RXQ]; - vr = vq_info->q_hva; - if (vr == NULL) - fatalx("%s: null vring", __func__); - - /* Compute offset into avail ring */ - avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); - vq_info->notified_avail = avail->idx - 1; -} - -/* - * Must be called with dev->mutex acquired. - */ -int -vionet_notifyq(struct vionet_dev *dev) -{ - int ret = 0; - - switch (dev->cfg.queue_notify) { - case RXQ: - vionet_notify_rx(dev); - break; - case TXQ: - ret = vionet_notify_tx(dev); - break; - default: - /* - * Catch the unimplemented queue ID 2 (control queue) as - * well as any bogus queue IDs. - */ - log_debug("%s: notify for unimplemented queue ID %d", - __func__, dev->cfg.queue_notify); - break; - } - - return (ret); -} - -/* - * Must be called with dev->mutex acquired. - */ -int -vionet_notify_tx(struct vionet_dev *dev) -{ - uint16_t idx, pkt_desc_idx, hdr_desc_idx, dxx, cnt; - size_t pktsz, chunk_size = 0; - ssize_t dhcpsz = 0; - int num_enq, ofs, spc = 0; - char *vr = NULL, *pkt = NULL, *dhcppkt = NULL; - struct vring_desc *desc, *pkt_desc, *hdr_desc; - struct vring_avail *avail; - struct vring_used *used; - struct virtio_vq_info *vq_info; - struct ether_header *eh; - - vq_info = &dev->vq[TXQ]; - vr = vq_info->q_hva; - if (vr == NULL) - fatalx("%s: null vring", __func__); - - /* Compute offsets in ring of descriptors, avail ring, and used ring */ - desc = (struct vring_desc *)(vr); - avail = (struct vring_avail *)(vr + vq_info->vq_availoffset); - used = (struct vring_used *)(vr + vq_info->vq_usedoffset); - - num_enq = 0; - - idx = vq_info->last_avail & VIONET_QUEUE_MASK; - - if ((avail->idx & VIONET_QUEUE_MASK) == idx) { - log_debug("%s - nothing to do?", __func__); - return (0); - } - - while ((avail->idx & VIONET_QUEUE_MASK) != idx) { - hdr_desc_idx = avail->ring[idx] & VIONET_QUEUE_MASK; - hdr_desc = &desc[hdr_desc_idx]; - pktsz = 0; - - cnt = 0; - dxx = hdr_desc_idx; - do { - pktsz += desc[dxx].len; - dxx = desc[dxx].next & VIONET_QUEUE_MASK; - - /* - * Virtio 1.0, cs04, section 2.4.5: - * "The number of descriptors in the table is defined - * by the queue size for this virtqueue: this is the - * maximum possible descriptor chain length." 
- */ - if (++cnt >= VIONET_QUEUE_SIZE) { - log_warnx("%s: descriptor table invalid", - __func__); - goto out; - } - } while (desc[dxx].flags & VRING_DESC_F_NEXT); - - pktsz += desc[dxx].len; - - /* Remove virtio header descriptor len */ - pktsz -= hdr_desc->len; - - /* Drop packets violating device MTU-based limits */ - if (pktsz < VIONET_MIN_TXLEN || pktsz > VIONET_MAX_TXLEN) { - log_warnx("%s: invalid packet size %lu", __func__, - pktsz); - goto drop_packet; - } - pkt = malloc(pktsz); - if (pkt == NULL) { - log_warn("malloc error alloc packet buf"); - goto out; - } - - ofs = 0; - pkt_desc_idx = hdr_desc->next & VIONET_QUEUE_MASK; - pkt_desc = &desc[pkt_desc_idx]; - - while (pkt_desc->flags & VRING_DESC_F_NEXT) { - /* must be not writable */ - if (pkt_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("unexpected writable tx desc " - "%d", pkt_desc_idx); - goto out; - } - - /* Check we don't read beyond allocated pktsz */ - if (pkt_desc->len > pktsz - ofs) { - log_warnx("%s: descriptor len past pkt len", - __func__); - chunk_size = pktsz - ofs; - } else - chunk_size = pkt_desc->len; - - /* Read packet from descriptor ring */ - if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { - log_warnx("vionet: packet read_mem error " - "@ 0x%llx", pkt_desc->addr); - goto out; - } - - ofs += pkt_desc->len; - pkt_desc_idx = pkt_desc->next & VIONET_QUEUE_MASK; - pkt_desc = &desc[pkt_desc_idx]; - } - - /* Now handle tail descriptor - must be not writable */ - if (pkt_desc->flags & VRING_DESC_F_WRITE) { - log_warnx("unexpected writable tx descriptor %d", - pkt_desc_idx); - goto out; - } - - /* Check we don't read beyond allocated pktsz */ - if (pkt_desc->len > pktsz - ofs) { - log_warnx("%s: descriptor len past pkt len", __func__); - chunk_size = pktsz - ofs - pkt_desc->len; - } else - chunk_size = pkt_desc->len; - - /* Read packet from descriptor ring */ - if (read_mem(pkt_desc->addr, pkt + ofs, chunk_size)) { - log_warnx("vionet: packet read_mem error @ " - "0x%llx", pkt_desc->addr); - goto out; - } - - /* reject other source addresses */ - if (dev->lockedmac && pktsz >= ETHER_HDR_LEN && - (eh = (struct ether_header *)pkt) && - memcmp(eh->ether_shost, dev->mac, - sizeof(eh->ether_shost)) != 0) - log_debug("vionet: wrong source address %s for vm %d", - ether_ntoa((struct ether_addr *) - eh->ether_shost), dev->vm_id); - else if (dev->local && - (dhcpsz = dhcp_request(dev, pkt, pktsz, &dhcppkt)) != -1) { - log_debug("vionet: dhcp request," - " local response size %zd", dhcpsz); - - /* XXX signed vs unsigned here, funky cast */ - } else if (write(dev->fd, pkt, pktsz) != (int)pktsz) { - log_warnx("vionet: tx failed writing to tap: " - "%d", errno); - goto out; - } - - drop_packet: - dev->cfg.isr_status = 1; - used->ring[used->idx & VIONET_QUEUE_MASK].id = hdr_desc_idx; - used->ring[used->idx & VIONET_QUEUE_MASK].len = hdr_desc->len; - __sync_synchronize(); - used->idx++; - - vq_info->last_avail = avail->idx & VIONET_QUEUE_MASK; - idx = (idx + 1) & VIONET_QUEUE_MASK; - - num_enq++; - - free(pkt); - pkt = NULL; - } - - if (dhcpsz > 0) - vionet_enq_rx(dev, dhcppkt, dhcpsz, &spc); - -out: - free(pkt); - free(dhcppkt); - - return (1); -} - -int vmmci_ctl(unsigned int cmd) { struct timeval tv = { 0, 0 }; @@ -1678,36 +489,15 @@ virtio_get_base(int fd, char *path, size_t npath, int return -1; } -/* - * Initializes a struct virtio_backing using the list of fds. 
- */ -static int -virtio_init_disk(struct virtio_backing *file, off_t *sz, - int *fd, size_t nfd, int type) -{ - /* - * probe disk types in order of preference, first one to work wins. - * TODO: provide a way of specifying the type and options. - */ - switch (type) { - case VMDF_RAW: - return virtio_raw_init(file, sz, fd, nfd); - case VMDF_QCOW2: - return virtio_qcow2_init(file, sz, fd, nfd); - } - log_warnx("%s: invalid disk format", __func__); - return -1; -} - void virtio_init(struct vmd_vm *vm, int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) { struct vmop_create_params *vmc = &vm->vm_params; struct vm_create_params *vcp = &vmc->vmc_params; + struct virtio_dev *dev; uint8_t id; - uint8_t i; - int ret; + uint8_t i, j; /* Virtio entropy device */ if (pci_add_device(&id, PCI_VENDOR_QUMRANET, @@ -1737,17 +527,20 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, viornd.irq = pci_get_dev_irq(id); viornd.vm_id = vcp->vcp_id; + SLIST_INIT(&virtio_devs); + if (vmc->vmc_nnics > 0) { - vionet = calloc(vmc->vmc_nnics, sizeof(struct vionet_dev)); - if (vionet == NULL) { + dev = calloc(vmc->vmc_nnics, sizeof(struct virtio_dev)); + if (dev == NULL) { log_warn("%s: calloc failure allocating vionets", __progname); return; } - nr_vionet = vmc->vmc_nnics; /* Virtio network */ for (i = 0; i < vmc->vmc_nnics; i++) { + dev[i].dev_type = VMD_DEVTYPE_NET; + if (pci_add_device(&id, PCI_VENDOR_QUMRANET, PCI_PRODUCT_QUMRANET_VIO_NET, PCI_CLASS_SYSTEM, PCI_SUBCLASS_SYSTEM_MISC, @@ -1757,78 +550,68 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, __progname); return; } + dev[i].pci_id = id; + dev[i].sync_fd = -1; + dev[i].async_fd = -1; + dev[i].vm_id = vcp->vcp_id; + dev[i].vm_vmid = vm->vm_vmid; + dev[i].irq = pci_get_dev_irq(id); - if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_net_io, - &vionet[i])) { + /* The vionet pci bar function is called by the vcpu. 
*/ + if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io, + &dev[i])) { log_warnx("%s: can't add bar for virtio net " "device", __progname); return; } - ret = pthread_mutex_init(&vionet[i].mutex, NULL); - if (ret) { - errno = ret; - log_warn("%s: could not initialize mutex " - "for vionet device", __progname); - return; - } - - vionet[i].vq[RXQ].qs = VIONET_QUEUE_SIZE; - vionet[i].vq[RXQ].vq_availoffset = + dev[i].vionet.vq[RXQ].qs = VIONET_QUEUE_SIZE; + dev[i].vionet.vq[RXQ].vq_availoffset = sizeof(struct vring_desc) * VIONET_QUEUE_SIZE; - vionet[i].vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN( + dev[i].vionet.vq[RXQ].vq_usedoffset = VIRTQUEUE_ALIGN( sizeof(struct vring_desc) * VIONET_QUEUE_SIZE + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE)); - vionet[i].vq[RXQ].last_avail = 0; - vionet[i].vq[RXQ].notified_avail = 0; + dev[i].vionet.vq[RXQ].last_avail = 0; + dev[i].vionet.vq[RXQ].notified_avail = 0; - vionet[i].vq[TXQ].qs = VIONET_QUEUE_SIZE; - vionet[i].vq[TXQ].vq_availoffset = + dev[i].vionet.vq[TXQ].qs = VIONET_QUEUE_SIZE; + dev[i].vionet.vq[TXQ].vq_availoffset = sizeof(struct vring_desc) * VIONET_QUEUE_SIZE; - vionet[i].vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN( + dev[i].vionet.vq[TXQ].vq_usedoffset = VIRTQUEUE_ALIGN( sizeof(struct vring_desc) * VIONET_QUEUE_SIZE + sizeof(uint16_t) * (2 + VIONET_QUEUE_SIZE)); - vionet[i].vq[TXQ].last_avail = 0; - vionet[i].vq[TXQ].notified_avail = 0; - vionet[i].fd = child_taps[i]; - vionet[i].vm_id = vcp->vcp_id; - vionet[i].vm_vmid = vm->vm_vmid; - vionet[i].irq = pci_get_dev_irq(id); + dev[i].vionet.vq[TXQ].last_avail = 0; + dev[i].vionet.vq[TXQ].notified_avail = 0; - event_set(&vionet[i].event, vionet[i].fd, - EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]); - if (event_add(&vionet[i].event, NULL)) { - log_warn("could not initialize vionet event " - "handler"); - return; - } + dev[i].vionet.data_fd = child_taps[i]; /* MAC address has been assigned by the parent */ - memcpy(&vionet[i].mac, &vmc->vmc_macs[i], 6); - vionet[i].cfg.device_feature = VIRTIO_NET_F_MAC; + memcpy(&dev[i].vionet.mac, &vmc->vmc_macs[i], 6); + dev[i].vionet.cfg.device_feature = VIRTIO_NET_F_MAC; - vionet[i].lockedmac = + dev[i].vionet.lockedmac = vmc->vmc_ifflags[i] & VMIFF_LOCKED ? 1 : 0; - vionet[i].local = + dev[i].vionet.local = vmc->vmc_ifflags[i] & VMIFF_LOCAL ? 1 : 0; if (i == 0 && vmc->vmc_bootdevice & VMBOOTDEV_NET) - vionet[i].pxeboot = 1; - vionet[i].idx = i; - vionet[i].pci_id = id; + dev[i].vionet.pxeboot = 1; log_debug("%s: vm \"%s\" vio%u lladdr %s%s%s%s", __func__, vcp->vcp_name, i, - ether_ntoa((void *)vionet[i].mac), - vionet[i].lockedmac ? ", locked" : "", - vionet[i].local ? ", local" : "", - vionet[i].pxeboot ? ", pxeboot" : ""); + ether_ntoa((void *)dev[i].vionet.mac), + dev[i].vionet.lockedmac ? ", locked" : "", + dev[i].vionet.local ? ", local" : "", + dev[i].vionet.pxeboot ? ", pxeboot" : ""); + + /* Add the vionet to our device list. 
*/ + dev[i].vionet.idx = i; + SLIST_INSERT_HEAD(&virtio_devs, &dev[i], dev_next); } } if (vmc->vmc_ndisks > 0) { - nr_vioblk = vmc->vmc_ndisks; - vioblk = calloc(vmc->vmc_ndisks, sizeof(struct vioblk_dev)); - if (vioblk == NULL) { + dev = calloc(vmc->vmc_ndisks, sizeof(struct virtio_dev)); + if (dev == NULL) { log_warn("%s: calloc failure allocating vioblks", __progname); return; @@ -1836,6 +619,8 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, /* One virtio block device for each disk defined in vcp */ for (i = 0; i < vmc->vmc_ndisks; i++) { + dev[i].dev_type = VMD_DEVTYPE_DISK; + if (pci_add_device(&id, PCI_VENDOR_QUMRANET, PCI_PRODUCT_QUMRANET_VIO_BLOCK, PCI_CLASS_MASS_STORAGE, @@ -1846,35 +631,56 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, "device", __progname); return; } - if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_blk_io, - &vioblk[i])) { + dev[i].pci_id = id; + dev[i].sync_fd = -1; + dev[i].async_fd = -1; + dev[i].vm_id = vcp->vcp_id; + dev[i].vm_vmid = vm->vm_vmid; + dev[i].irq = pci_get_dev_irq(id); + + if (pci_add_bar(id, PCI_MAPREG_TYPE_IO, virtio_pci_io, + &dev[i].vioblk)) { log_warnx("%s: can't add bar for virtio block " "device", __progname); return; } - vioblk[i].vq[0].qs = VIOBLK_QUEUE_SIZE; - vioblk[i].vq[0].vq_availoffset = + dev[i].vioblk.vq[0].qs = VIOBLK_QUEUE_SIZE; + dev[i].vioblk.vq[0].vq_availoffset = sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE; - vioblk[i].vq[0].vq_usedoffset = VIRTQUEUE_ALIGN( + dev[i].vioblk.vq[0].vq_usedoffset = VIRTQUEUE_ALIGN( sizeof(struct vring_desc) * VIOBLK_QUEUE_SIZE + sizeof(uint16_t) * (2 + VIOBLK_QUEUE_SIZE)); - vioblk[i].vq[0].last_avail = 0; - vioblk[i].cfg.device_feature = VIRTIO_BLK_F_SIZE_MAX; - vioblk[i].max_xfer = 1048576; - vioblk[i].pci_id = id; - vioblk[i].vm_id = vcp->vcp_id; - vioblk[i].irq = pci_get_dev_irq(id); - if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i], vmc->vmc_diskbases[i], - vmc->vmc_disktypes[i]) == -1) { - log_warnx("%s: unable to determine disk format", - __func__); - return; - } - vioblk[i].sz /= 512; + dev[i].vioblk.vq[0].last_avail = 0; + dev[i].vioblk.cfg.device_feature = + VIRTIO_BLK_F_SIZE_MAX; + dev[i].vioblk.max_xfer = 1048576; + + /* + * Initialize disk fds to an invalid fd (-1), then + * set any child disk fds. + */ + memset(&dev[i].vioblk.disk_fd, -1, + sizeof(dev[i].vioblk.disk_fd)); + dev[i].vioblk.ndisk_fd = vmc->vmc_diskbases[i]; + for (j = 0; j < dev[i].vioblk.ndisk_fd; j++) + dev[i].vioblk.disk_fd[j] = child_disks[i][j]; + + dev[i].vioblk.idx = i; + SLIST_INSERT_HEAD(&virtio_devs, &dev[i], dev_next); } } + /* Wire up child signal handler. */ + signal_set(&ev_sigchld, SIGCHLD, virtio_sighdlr, NULL); + + /* + * Launch virtio devices that support subprocess execution. 
+ */ + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + if (virtio_dev_launch(vm, dev) != 0) + fatalx("failed to launch virtio device"); + } + /* vioscsi cdrom */ if (strlen(vmc->vmc_cdrom)) { vioscsi = calloc(1, sizeof(struct vioscsi_dev)); @@ -1910,15 +716,15 @@ virtio_init(struct vmd_vm *vm, int child_cdrom, + sizeof(uint16_t) * (2 + VIOSCSI_QUEUE_SIZE)); vioscsi->vq[i].last_avail = 0; } - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, - &child_cdrom, 1, VMDF_RAW) == -1) { + if (virtio_raw_init(&vioscsi->file, &vioscsi->sz, &child_cdrom, + 1) == -1) { log_warnx("%s: unable to determine iso format", __func__); return; } vioscsi->locked = 0; vioscsi->lba = 0; - vioscsi->n_blocks = vioscsi->sz >> 11; /* num of 2048 blocks in file */ + vioscsi->n_blocks = vioscsi->sz >> 2; /* num of 2048 blocks in file */ vioscsi->max_xfer = VIOSCSI_BLOCK_SIZE_CDROM; vioscsi->pci_id = id; vioscsi->vm_id = vcp->vcp_id; @@ -1967,27 +773,84 @@ vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx void vionet_set_hostmac(struct vmd_vm *vm, unsigned int idx, uint8_t *addr) { - struct vmop_create_params *vmc = &vm->vm_params; - struct vionet_dev *dev; + struct vmop_create_params *vmc = &vm->vm_params; + struct virtio_dev *dev; + struct vionet_dev *vionet = NULL; + int ret; if (idx > vmc->vmc_nnics) - fatalx("vionet_set_hostmac"); + fatalx("%s: invalid vionet index: %u", __func__, idx); - dev = &vionet[idx]; - memcpy(dev->hostmac, addr, sizeof(dev->hostmac)); + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + if (dev->dev_type == VMD_DEVTYPE_NET + && dev->vionet.idx == idx) { + vionet = &dev->vionet; + break; + } + } + if (vionet == NULL) + fatalx("%s: dev == NULL", __func__); + + /* Set the local vm process copy. */ + memcpy(vionet->hostmac, addr, sizeof(vionet->hostmac)); + + /* Send the information to the device process. */ + ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_HOSTMAC, 0, 0, -1, + vionet->hostmac, sizeof(vionet->hostmac)); + if (ret == -1) { + log_warnx("%s: failed to queue hostmac to vionet dev %u", + __func__, idx); + return; + } } void virtio_shutdown(struct vmd_vm *vm) { - int i; + int ret, status; + pid_t pid = 0; + struct virtio_dev *dev, *tmp; + struct viodev_msg msg; + struct imsgbuf *ibuf; - /* ensure that our disks are synced */ + /* Ensure that our disks are synced. */ if (vioscsi != NULL) vioscsi->file.close(vioscsi->file.p, 0); - for (i = 0; i < nr_vioblk; i++) - vioblk[i].file.close(vioblk[i].file.p, 0); + /* + * Broadcast shutdown to child devices. We need to do this + * synchronously as we have already stopped the async event thread. + */ + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + memset(&msg, 0, sizeof(msg)); + msg.type = VIODEV_MSG_SHUTDOWN; + ibuf = &dev->sync_iev.ibuf; + ret = imsg_compose(ibuf, VIODEV_MSG_SHUTDOWN, 0, 0, -1, + &msg, sizeof(msg)); + if (ret == -1) + fatalx("%s: failed to send shutdown to device", + __func__); + if (imsg_flush(ibuf) == -1) + fatalx("%s: imsg_flush", __func__); + } + + /* + * Wait for all children to shutdown using a simple approach of + * iterating over known child devices and waiting for them to die. 
+ */ + SLIST_FOREACH_SAFE(dev, &virtio_devs, dev_next, tmp) { + log_debug("%s: waiting on device pid %d", __func__, + dev->dev_pid); + do { + pid = waitpid(dev->dev_pid, &status, WNOHANG); + } while (pid == 0 || (pid == -1 && errno == EINTR)); + if (pid == dev->dev_pid) + log_debug("%s: device for pid %d is stopped", + __func__, pid); + else + log_warnx("%s: unexpected pid %d", __func__, pid); + free(dev); + } } int @@ -2042,22 +905,21 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_t { struct vmop_create_params *vmc = &vm->vm_params; struct vm_create_params *vcp = &vmc->vmc_params; + struct virtio_dev *dev; uint8_t i; - int ret; - void *hva = NULL; - nr_vionet = vmc->vmc_nnics; if (vmc->vmc_nnics > 0) { - vionet = calloc(vmc->vmc_nnics, sizeof(struct vionet_dev)); - if (vionet == NULL) { + dev = calloc(vmc->vmc_nnics, sizeof(struct virtio_dev)); + if (dev == NULL) { log_warn("%s: calloc failure allocating vionets", __progname); return (-1); } - log_debug("%s: receiving vionet", __func__); - if (atomicio(read, fd, vionet, - vmc->vmc_nnics * sizeof(struct vionet_dev)) != - vmc->vmc_nnics * sizeof(struct vionet_dev)) { + + log_debug("%s: receiving virtio network devices", __func__); + if (atomicio(read, fd, dev, + vmc->vmc_nnics * sizeof(struct virtio_dev)) != + vmc->vmc_nnics * sizeof(struct virtio_dev)) { log_warnx("%s: error reading vionet from fd", __func__); return (-1); @@ -2065,42 +927,29 @@ vionet_restore(int fd, struct vmd_vm *vm, int *child_t /* Virtio network */ for (i = 0; i < vmc->vmc_nnics; i++) { - if (pci_set_bar_fn(vionet[i].pci_id, 0, virtio_net_io, - &vionet[i])) { + if (dev[i].dev_type != VMD_DEVTYPE_NET) { + log_warnx("%s: invalid device type", __func__); + return (-1); + } + + dev[i].sync_fd = -1; + dev[i].async_fd = -1; + dev[i].vm_id = vcp->vcp_id; + dev[i].vm_vmid = vm->vm_vmid; + dev[i].irq = pci_get_dev_irq(dev[i].pci_id); + + if (pci_set_bar_fn(dev[i].pci_id, 0, virtio_pci_io, + &dev[i])) { log_warnx("%s: can't set bar fn for virtio net " "device", __progname); return (-1); } - memset(&vionet[i].mutex, 0, sizeof(pthread_mutex_t)); - ret = pthread_mutex_init(&vionet[i].mutex, NULL); + dev[i].vionet.data_fd = child_taps[i]; + dev[i].vionet.idx = i; - if (ret) { - errno = ret; - log_warn("%s: could not initialize mutex " - "for vionet device", __progname); - return (-1); - } - vionet[i].fd = child_taps[i]; - vionet[i].vm_id = vcp->vcp_id; - vionet[i].vm_vmid = vm->vm_vmid; - vionet[i].irq = pci_get_dev_irq(vionet[i].pci_id); - - hva = hvaddr_mem(vionet[i].vq[RXQ].q_gpa, - vring_size(VIONET_QUEUE_SIZE)); - if (hva == NULL) - fatal("failed to restore vionet RX virtqueue"); - vionet[i].vq[RXQ].q_hva = hva; - - hva = hvaddr_mem(vionet[i].vq[TXQ].q_gpa, - vring_size(VIONET_QUEUE_SIZE)); - if (hva == NULL) - fatal("failed to restore vionet TX virtqueue"); - vionet[i].vq[TXQ].q_hva = hva; - - memset(&vionet[i].event, 0, sizeof(struct event)); - event_set(&vionet[i].event, vionet[i].fd, - EV_READ | EV_PERSIST, vionet_rx_event, &vionet[i]); + log_info("%s: restored vionet[%d]", __func__, i); + SLIST_INSERT_HEAD(&virtio_devs, &dev[i], dev_next); } } return (0); @@ -2110,44 +959,42 @@ vioblk_restore(int fd, struct vmd_vm *vm, vioblk_restore(int fd, struct vmd_vm *vm, int child_disks[][VM_MAX_BASE_PER_DISK]) { + struct vmop_create_params *vmc = &vm->vm_params; + struct virtio_dev *dev; uint8_t i; - void *hva = NULL; - nr_vioblk = vm->vm_params.vmc_ndisks; - vioblk = calloc(vm->vm_params.vmc_ndisks, sizeof(struct vioblk_dev)); - if (vioblk == NULL) { + dev = 
calloc(vmc->vmc_ndisks, sizeof(struct virtio_dev)); + if (dev == NULL) { log_warn("%s: calloc failure allocating vioblks", __progname); return (-1); } + log_debug("%s: receiving vioblk", __func__); - if (atomicio(read, fd, vioblk, - nr_vioblk * sizeof(struct vioblk_dev)) != - nr_vioblk * sizeof(struct vioblk_dev)) { + if (atomicio(read, fd, dev, + vmc->vmc_ndisks * sizeof(struct virtio_dev)) != + vmc->vmc_ndisks * sizeof(struct virtio_dev)) { log_warnx("%s: error reading vioblk from fd", __func__); return (-1); } - for (i = 0; i < vm->vm_params.vmc_ndisks; i++) { - if (pci_set_bar_fn(vioblk[i].pci_id, 0, virtio_blk_io, - &vioblk[i])) { + for (i = 0; i < vmc->vmc_ndisks; i++) { + if (dev[i].dev_type != VMD_DEVTYPE_DISK) { + log_warnx("%s: invalid device type", __func__); + return (-1); + } + + dev[i].sync_fd = -1; + dev[i].async_fd = -1; + + if (pci_set_bar_fn(dev[i].pci_id, 0, virtio_pci_io, &dev[i])) { log_warnx("%s: can't set bar fn for virtio block " "device", __progname); return (-1); } - if (virtio_init_disk(&vioblk[i].file, &vioblk[i].sz, - child_disks[i], vm->vm_params.vmc_diskbases[i], - vm->vm_params.vmc_disktypes[i]) == -1) { - log_warnx("%s: unable to determine disk format", - __func__); - return (-1); - } - vioblk[i].vm_id = vm->vm_params.vmc_params.vcp_id; - vioblk[i].irq = pci_get_dev_irq(vioblk[i].pci_id); + dev[i].vm_id = vmc->vmc_params.vcp_id; + dev[i].irq = pci_get_dev_irq(dev[i].pci_id); - hva = hvaddr_mem(vioblk[i].vq[0].q_gpa, - vring_size(VIOBLK_QUEUE_SIZE)); - if (hva == NULL) - fatal("failed to restore vioblk virtqueue"); - vioblk[i].vq[0].q_hva = hva; + dev[i].vioblk.idx = i; + SLIST_INSERT_HEAD(&virtio_devs, &dev[i], dev_next); } return (0); } @@ -2181,11 +1028,6 @@ vioscsi_restore(int fd, struct vmd_vm *vm, int child_c return (-1); } - if (virtio_init_disk(&vioscsi->file, &vioscsi->sz, &child_cdrom, 1, - VMDF_RAW) == -1) { - log_warnx("%s: unable to determine iso format", __func__); - return (-1); - } vioscsi->vm_id = vm->vm_params.vmc_params.vcp_id; vioscsi->irq = pci_get_dev_irq(vioscsi->pci_id); @@ -2205,23 +1047,25 @@ virtio_restore(int fd, struct vmd_vm *vm, int child_cd virtio_restore(int fd, struct vmd_vm *vm, int child_cdrom, int child_disks[][VM_MAX_BASE_PER_DISK], int *child_taps) { + struct virtio_dev *dev; int ret; + SLIST_INIT(&virtio_devs); + if ((ret = viornd_restore(fd, vm)) == -1) return ret; - if ((ret = vioblk_restore(fd, vm, child_disks)) == -1) - return ret; - if ((ret = vioscsi_restore(fd, vm, child_cdrom)) == -1) - return ret; + return (ret); - if ((ret = vionet_restore(fd, vm, child_taps)) == -1) - return ret; - if ((ret = vmmci_restore(fd, vm->vm_params.vmc_params.vcp_id)) == -1) - return ret; + return (ret); + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + if (virtio_dev_launch(vm, dev) != 0) + fatalx("%s: failed to restore virtio dev", __func__); + } + return (0); } @@ -2254,40 +1098,114 @@ vionet_dump(int fd) int vionet_dump(int fd) { - int i; + struct virtio_dev *dev, temp; + struct viodev_msg msg; + struct imsg imsg; + struct imsgbuf *ibuf = NULL; + size_t sz; + int ret; - log_debug("%s: sending vionet", __func__); + log_debug("%s: dumping vionet", __func__); - for (i = 0; i < nr_vionet; i++) { - vionet[i].vq[RXQ].q_hva = NULL; - vionet[i].vq[TXQ].q_hva = NULL; - } + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + if (dev->dev_type != VMD_DEVTYPE_NET) + continue; - if (atomicio(vwrite, fd, vionet, - nr_vionet * sizeof(struct vionet_dev)) != - nr_vionet * sizeof(struct vionet_dev)) { - log_warnx("%s: error writing vionet to fd", 
__func__); - return (-1); + memset(&msg, 0, sizeof(msg)); + memset(&imsg, 0, sizeof(imsg)); + + ibuf = &dev->sync_iev.ibuf; + msg.type = VIODEV_MSG_DUMP; + + ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + if (ret == -1) { + log_warnx("%s: failed requesting dump of vionet[%d]", + __func__, dev->vionet.idx); + return (-1); + } + if (imsg_flush(ibuf) == -1) { + log_warnx("%s: imsg_flush", __func__); + return (-1); + } + + sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp)); + if (sz != sizeof(temp)) { + log_warnx("%s: failed to dump vionet[%d]", __func__, + dev->vionet.idx); + return (-1); + } + + temp.vionet.vq[RXQ].q_hva = NULL; + temp.vionet.vq[TXQ].q_hva = NULL; + temp.async_fd = -1; + temp.sync_fd = -1; + memset(&temp.async_iev, 0, sizeof(temp.async_iev)); + memset(&temp.sync_iev, 0, sizeof(temp.sync_iev)); + + if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) { + log_warnx("%s: error writing vionet to fd", __func__); + return (-1); + } } + return (0); } int vioblk_dump(int fd) { - int i; + struct virtio_dev *dev, temp; + struct viodev_msg msg; + struct imsg imsg; + struct imsgbuf *ibuf = NULL; + size_t sz; + int ret; - log_debug("%s: sending vioblk", __func__); + log_debug("%s: dumping vioblk", __func__); - for (i = 0; i < nr_vioblk; i++) - vioblk[i].vq[0].q_hva = NULL; + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + if (dev->dev_type != VMD_DEVTYPE_DISK) + continue; - if (atomicio(vwrite, fd, vioblk, - nr_vioblk * sizeof(struct vioblk_dev)) != - nr_vioblk * sizeof(struct vioblk_dev)) { - log_warnx("%s: error writing vioblk to fd", __func__); - return (-1); + memset(&msg, 0, sizeof(msg)); + memset(&imsg, 0, sizeof(imsg)); + + ibuf = &dev->sync_iev.ibuf; + msg.type = VIODEV_MSG_DUMP; + + ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + if (ret == -1) { + log_warnx("%s: failed requesting dump of vioblk[%d]", + __func__, dev->vioblk.idx); + return (-1); + } + if (imsg_flush(ibuf) == -1) { + log_warnx("%s: imsg_flush", __func__); + return (-1); + } + + + sz = atomicio(read, dev->sync_fd, &temp, sizeof(temp)); + if (sz != sizeof(temp)) { + log_warnx("%s: failed to dump vioblk[%d]", __func__, + dev->vioblk.idx); + return (-1); + } + + temp.vioblk.vq[0].q_hva = NULL; + temp.async_fd = -1; + temp.sync_fd = -1; + memset(&temp.async_iev, 0, sizeof(temp.async_iev)); + memset(&temp.sync_iev, 0, sizeof(temp.sync_iev)); + + if (atomicio(vwrite, fd, &temp, sizeof(temp)) != sizeof(temp)) { + log_warnx("%s: error writing vioblk to fd", __func__); + return (-1); + } } + return (0); } @@ -2338,12 +1256,15 @@ virtio_stop(struct vmd_vm *vm) void virtio_stop(struct vmd_vm *vm) { - uint8_t i; - for (i = 0; i < vm->vm_params.vmc_nnics; i++) { - if (event_del(&vionet[i].event)) { - log_warn("could not initialize vionet event " - "handler"); - return; + struct virtio_dev *dev; + int ret; + + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_PAUSE_VM, + 0, 0, -1, NULL, 0); + if (ret == -1) { + log_warnx("%s: failed to compose pause msg to device", + __func__); } } } @@ -2351,12 +1272,506 @@ virtio_start(struct vmd_vm *vm) void virtio_start(struct vmd_vm *vm) { - uint8_t i; - for (i = 0; i < vm->vm_params.vmc_nnics; i++) { - if (event_add(&vionet[i].event, NULL)) { - log_warn("could not initialize vionet event " - "handler"); + struct virtio_dev *dev; + int ret; + + SLIST_FOREACH(dev, &virtio_devs, dev_next) { + ret = imsg_compose_event(&dev->async_iev, IMSG_VMDOP_UNPAUSE_VM, + 0, 0, -1, NULL, 
0); + if (ret == -1) { + log_warnx("%s: failed to compose start msg to device", + __func__); + } + } +} + +/* + * Fork+exec a child virtio device. Returns 0 on success. + */ +static int +virtio_dev_launch(struct vmd_vm *vm, struct virtio_dev *dev) +{ + char *nargv[7], num[32], t[2]; + pid_t dev_pid; + int data_fds[VM_MAX_BASE_PER_DISK], sync_fds[2], async_fds[2], ret = 0; + size_t i, j, data_fds_sz, sz = 0; + struct virtio_dev *d = NULL; + struct viodev_msg msg; + struct imsg imsg; + struct imsgev *iev = &dev->sync_iev; + + switch (dev->dev_type) { + case VMD_DEVTYPE_NET: + data_fds[0] = dev->vionet.data_fd; + data_fds_sz = 1; + log_info("%s: launching vionet[%d]", + vm->vm_params.vmc_params.vcp_name, dev->vionet.idx); + break; + case VMD_DEVTYPE_DISK: + memcpy(&data_fds, dev->vioblk.disk_fd, sizeof(data_fds)); + data_fds_sz = dev->vioblk.ndisk_fd; + log_info("%s: launching vioblk[%d]", + vm->vm_params.vmc_params.vcp_name, dev->vioblk.idx); + break; + /* NOTREACHED */ + default: + log_warn("%s: invalid device type", __func__); + return (EINVAL); + } + + /* We need two channels: one synchronous (IO reads) and one async. */ + if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, sync_fds) == -1) { + log_warn("failed to create socketpair"); + return (errno); + } + if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, async_fds) == -1) { + log_warn("failed to create async socketpair"); + return (errno); + } + + /* Keep communication channels open after exec. */ + if (fcntl(sync_fds[1], F_SETFD, 0)) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto err; + } + if (fcntl(async_fds[1], F_SETFD, 0)) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto err; + } + + /* Keep data file descriptors open after exec. */ + for (i = 0; i < data_fds_sz; i++) { + log_debug("%s: marking fd %d !close-on-exec", __func__, + data_fds[i]); + if (fcntl(data_fds[i], F_SETFD, 0)) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto err; + } + } + + /* Fork... */ + dev_pid = fork(); + if (dev_pid == -1) { + ret = errno; + log_warn("%s: fork failed", __func__); + goto err; + } + + if (dev_pid > 0) { + /* Parent */ + close_fd(sync_fds[1]); + close_fd(async_fds[1]); + + /* Save the child's pid to help with cleanup. */ + dev->dev_pid = dev_pid; + + /* Set the channel fds to the child's before sending. */ + dev->sync_fd = sync_fds[1]; + dev->async_fd = async_fds[1]; + + /* Close data fds. Only the child device needs them now. */ + for (i = 0; i < data_fds_sz; i++) + close_fd(data_fds[i]); + + /* Set our synchronous channel to non-blocking. */ + if (fcntl(sync_fds[0], F_SETFL, O_NONBLOCK) == -1) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto err; + } + + /* 1. Send over our configured device. */ + log_info("%s: sending '%c' type device struct", __func__, + dev->dev_type); + sz = atomicio(vwrite, sync_fds[0], dev, sizeof(*dev)); + if (sz != sizeof(*dev)) { + log_warnx("%s: failed to send device", __func__); + ret = EIO; + goto err; + } + + /* 2. Send over details on the VM (including memory fds). */ + log_info("%s: sending vm message for '%s'", __func__, + vm->vm_params.vmc_params.vcp_name); + sz = atomicio(vwrite, sync_fds[0], vm, sizeof(*vm)); + if (sz != sizeof(*vm)) { + log_warnx("%s: failed to send vm details", __func__); + ret = EIO; + goto err; + } + + /* + * Initialize our imsg channel to the child device. The initial + * communication will be synchronous. We expect the child to + * report itself "ready" to confirm the launch was a success. 
+ */ + imsg_init(&iev->ibuf, sync_fds[0]); + do + ret = imsg_read(&iev->ibuf); + while (ret == -1 && errno == EAGAIN); + if (ret == 0 || ret == -1) { + log_warnx("%s: failed to receive ready message from " + "'%c' type device", __func__, dev->dev_type); + ret = EIO; + goto err; + } + ret = 0; + + log_info("%s: receiving reply", __func__); + if (imsg_get(&iev->ibuf, &imsg) < 1) { + log_warnx("%s: imsg_get", __func__); + ret = EIO; + goto err; + } + IMSG_SIZE_CHECK(&imsg, &msg); + memcpy(&msg, imsg.data, sizeof(msg)); + imsg_free(&imsg); + + if (msg.type != VIODEV_MSG_READY) { + log_warnx("%s: expected ready message, got type %d", + __func__, msg.type); + ret = EINVAL; + goto err; + } + log_info("%s: device reports ready via sync channel", + __func__); + + /* + * Wire in the async event handling, but after reverting back + * to the parent's fd's. + */ + dev->sync_fd = sync_fds[0]; + dev->async_fd = async_fds[0]; + vm_device_pipe(dev, virtio_dispatch_dev); + } else { + /* Child */ + close_fd(async_fds[0]); + close_fd(sync_fds[0]); + + /* + * Close any other device fd's we know aren't + * ours. This releases any exclusive locks held on + * things like disk images. + */ + SLIST_FOREACH(d, &virtio_devs, dev_next) { + if (d == dev) + continue; + + switch (d->dev_type) { + case VMD_DEVTYPE_DISK: + for (j = 0; j < d->vioblk.ndisk_fd; j++) + close_fd(d->vioblk.disk_fd[j]); + break; + case VMD_DEVTYPE_NET: + close_fd(d->vionet.data_fd); + break; + default: + fatalx("%s: invalid device type '%c'", + __func__, d->dev_type); + } + } + + memset(num, 0, sizeof(num)); + snprintf(num, sizeof(num), "%d", sync_fds[1]); + + t[0] = dev->dev_type; + t[1] = '\0'; + + nargv[0] = env->argv0; + nargv[1] = "-X"; + nargv[2] = num; + nargv[3] = "-t"; + nargv[4] = t; + nargv[5] = "-n"; + nargv[6] = NULL; + + /* Control resumes in vmd.c:main(). */ + execvp(nargv[0], nargv); + + ret = errno; + log_warn("%s: failed to exec device", __func__); + _exit(ret); + /* NOTREACHED */ + } + + return (ret); + +err: + close_fd(sync_fds[0]); + close_fd(sync_fds[1]); + close_fd(async_fds[0]); + close_fd(async_fds[1]); + return (ret); +} + +/* + * Initialize an async imsg channel for a virtio device. 
+ */ +int +vm_device_pipe(struct virtio_dev *dev, void (*cb)(int, short, void *)) +{ + struct imsgev *iev = &dev->async_iev; + int fd = dev->async_fd; + + log_info("%s: initializing '%c' device pipe (fd=%d)", __func__, + dev->dev_type, fd); + + if (fcntl(fd, F_SETFL, O_NONBLOCK) == -1) { + log_warn("failed to set nonblocking mode on vm device pipe"); + return (-1); + } + + imsg_init(&iev->ibuf, fd); + iev->handler = cb; + iev->data = dev; + iev->events = EV_READ; + imsg_event_add(iev); + + return (0); +} + +void +virtio_dispatch_dev(int fd, short event, void *arg) +{ + struct virtio_dev *dev = (struct virtio_dev*)arg; + struct imsgev *iev = &dev->async_iev; + struct imsgbuf *ibuf = &iev->ibuf; + struct imsg imsg; + struct viodev_msg msg; + ssize_t n = 0; + + if (event & EV_READ) { + if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) + fatal("%s: imsg_read", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_READ)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); return; } } + + if (event & EV_WRITE) { + if ((n = msgbuf_write(&ibuf->w)) == -1 && errno != EAGAIN) + fatal("%s: msgbuf_write", __func__); + if (n == 0) { + /* this pipe is dead, so remove the event handler */ + log_debug("%s: pipe dead (EV_WRITE)", __func__); + event_del(&iev->ev); + event_loopexit(NULL); + return; + } + } + + for (;;) { + if ((n = imsg_get(ibuf, &imsg)) == -1) + fatal("%s: imsg_get", __func__); + if (n == 0) + break; + + switch (imsg.hdr.type) { + case IMSG_DEVOP_MSG: + IMSG_SIZE_CHECK(&imsg, &msg); + memcpy(&msg, imsg.data, sizeof(msg)); + handle_dev_msg(&msg, dev); + break; + default: + log_warnx("%s: got non devop imsg %d", __func__, + imsg.hdr.type); + break; + } + imsg_free(&imsg); + } + imsg_event_add(iev); } + + +static int +handle_dev_msg(struct viodev_msg *msg, struct virtio_dev *gdev) +{ + uint32_t vm_id = gdev->vm_id; + int irq = gdev->irq; + + switch (msg->type) { + case VIODEV_MSG_KICK: + if (msg->state == INTR_STATE_ASSERT) + vcpu_assert_pic_irq(vm_id, msg->vcpu, irq); + else if (msg->state == INTR_STATE_DEASSERT) + vcpu_deassert_pic_irq(vm_id, msg->vcpu, irq); + break; + case VIODEV_MSG_READY: + log_debug("%s: device reports ready", __func__); + break; + case VIODEV_MSG_ERROR: + log_warnx("%s: device reported error", __func__); + break; + case VIODEV_MSG_INVALID: + case VIODEV_MSG_IO_READ: + case VIODEV_MSG_IO_WRITE: + /* FALLTHROUGH */ + default: + log_warnx("%s: unsupported device message type %d", __func__, + msg->type); + return (1); + } + + return (0); +}; + +/* + * Called by the VM process while processing IO from the VCPU thread. + * + * N.b. Since the VCPU thread calls this function, we cannot mutate the event + * system. All ipc messages must be sent manually and cannot be queued for + * the event loop to push them. (We need to perform a synchronous read, so + * this isn't really a big deal.) + */ +int +virtio_pci_io(int dir, uint16_t reg, uint32_t *data, uint8_t *intr, + void *cookie, uint8_t sz) +{ + struct virtio_dev *dev = (struct virtio_dev *)cookie; + struct imsgbuf *ibuf = &dev->sync_iev.ibuf; + struct imsg imsg; + struct viodev_msg msg; + ssize_t n; + int ret = 0; + + memset(&msg, 0, sizeof(msg)); + msg.reg = reg; + msg.io_sz = sz; + + if (dir == 0) { + msg.type = VIODEV_MSG_IO_WRITE; + msg.data = *data; + msg.data_valid = 1; + } else + msg.type = VIODEV_MSG_IO_READ; + + if (msg.type == VIODEV_MSG_IO_WRITE) { + /* + * Write request. No reply expected. 
+ */ + ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + if (ret == -1) { + log_warn("%s: failed to send async io event to vionet" + " device", __func__); + return (ret); + } + if (imsg_flush(ibuf) == -1) { + log_warnx("%s: imsg_flush (write)", __func__); + return (-1); + } + } else { + /* + * Read request. Requires waiting for a reply. + */ + ret = imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, &msg, + sizeof(msg)); + if (ret == -1) { + log_warnx("%s: failed to send sync io event to vionet" + " device", __func__); + return (ret); + } + if (imsg_flush(ibuf) == -1) { + log_warnx("%s: imsg_flush (read)", __func__); + return (-1); + } + + /* Read our reply. */ + do + n = imsg_read(ibuf); + while (n == -1 && errno == EAGAIN); + if (n == 0 || n == -1) { + log_warn("%s: imsg_read (n=%ld)", __func__, n); + return (-1); + } + if ((n = imsg_get(ibuf, &imsg)) == -1) { + log_warn("%s: imsg_get (n=%ld)", __func__, n); + return (-1); + } + if (n == 0) { + log_warnx("%s: invalid imsg", __func__); + return (-1); + } + + IMSG_SIZE_CHECK(&imsg, &msg); + memcpy(&msg, imsg.data, sizeof(msg)); + imsg_free(&imsg); + + if (msg.type == VIODEV_MSG_IO_READ && msg.data_valid) { + log_debug("%s: got sync read response (reg=%s)", + __func__, virtio_reg_name(msg.reg)); + *data = msg.data; + /* + * It's possible we're asked to {de,}assert after the + * device performs a register read. + */ + if (msg.state == INTR_STATE_ASSERT) + vcpu_assert_pic_irq(dev->vm_id, msg.vcpu, msg.irq); + else if (msg.state == INTR_STATE_DEASSERT) + vcpu_deassert_pic_irq(dev->vm_id, msg.vcpu, msg.irq); + } else { + log_warnx("%s: expected IO_READ, got %d", __func__, + msg.type); + return (-1); + } + } + + return (0); +} + +static void +virtio_sighdlr(int sig, short event, void *arg) +{ + pid_t pid; + int status; + + switch (sig) { + case SIGCHLD: + pid = waitpid(-1, &status, WNOHANG); + log_info("%s: child %d died.", __func__, pid); + break; + } +} + +void +virtio_assert_pic_irq(struct virtio_dev *dev, int vcpu) +{ + struct viodev_msg msg; + int ret; + + memset(&msg, 0, sizeof(msg)); + msg.irq = dev->irq; + msg.vcpu = vcpu; + msg.type = VIODEV_MSG_KICK; + msg.state = INTR_STATE_ASSERT; + + ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1, + &msg, sizeof(msg)); + if (ret == -1) + log_warnx("%s: failed to assert irq %d", __func__, dev->irq); +} + +void +virtio_deassert_pic_irq(struct virtio_dev *dev, int vcpu) +{ + struct viodev_msg msg; + int ret; + + memset(&msg, 0, sizeof(msg)); + msg.irq = dev->irq; + msg.vcpu = vcpu; + msg.type = VIODEV_MSG_KICK; + msg.state = INTR_STATE_DEASSERT; + + ret = imsg_compose_event(&dev->async_iev, IMSG_DEVOP_MSG, 0, 0, -1, + &msg, sizeof(msg)); + if (ret == -1) + log_warnx("%s: failed to deassert irq %d", __func__, dev->irq); +} blob - f090ed5c38311dd036d7ffd58cfd2da3a8cc6701 blob + 8a7e99da880fe648fa302f2b680ccf2c951a780f --- usr.sbin/vmd/virtio.h +++ usr.sbin/vmd/virtio.h @@ -63,12 +63,42 @@ */ #define VIRTIO_MAX_QUEUES 3 +#define MAXPHYS (64 * 1024) /* max raw I/O transfer size */ + /* * Rename the address config register to be more descriptive. */ #define VIRTIO_CONFIG_QUEUE_PFN VIRTIO_CONFIG_QUEUE_ADDRESS /* + * VM <-> Device messaging. 
+ */ +struct viodev_msg { + uint8_t type; +#define VIODEV_MSG_INVALID 0 +#define VIODEV_MSG_READY 1 +#define VIODEV_MSG_ERROR 2 +#define VIODEV_MSG_KICK 3 +#define VIODEV_MSG_IO_READ 4 +#define VIODEV_MSG_IO_WRITE 5 +#define VIODEV_MSG_DUMP 6 +#define VIODEV_MSG_SHUTDOWN 7 + + uint16_t reg; /* VirtIO register */ + uint8_t io_sz; /* IO instruction size */ + uint8_t vcpu; /* VCPU id */ + uint8_t irq; /* IRQ number */ + + int8_t state; /* Interrupt state toggle (if any) */ +#define INTR_STATE_ASSERT 1 +#define INTR_STATE_NOOP 0 +#define INTR_STATE_DEASSERT -1 + + uint32_t data; /* Data (if any) */ + uint8_t data_valid; /* 1 if data field is populated. */ +} __packed; + +/* * This struct stores notifications from a virtio driver. There is * one such struct per virtio device. */ @@ -177,16 +207,15 @@ struct vioblk_dev { struct vioblk_dev { struct virtio_io_cfg cfg; - struct virtio_vq_info vq[VIRTIO_MAX_QUEUES]; struct virtio_backing file; - uint64_t sz; + int disk_fd[VM_MAX_BASE_PER_DISK]; /* fds for disk image(s) */ + uint8_t ndisk_fd; /* number of valid disk fds */ + uint64_t sz; /* size in 512 byte sectors */ uint32_t max_xfer; - uint8_t pci_id; - int irq; - uint32_t vm_id; + unsigned int idx; }; /* vioscsi will use at least 3 queues - 5.6.2 Virtqueues @@ -218,26 +247,40 @@ struct vionet_dev { }; struct vionet_dev { - pthread_mutex_t mutex; - struct event event; - struct virtio_io_cfg cfg; - struct virtio_vq_info vq[VIRTIO_MAX_QUEUES]; - int fd; - uint32_t vm_id; - uint32_t vm_vmid; - int irq; + int data_fd; /* fd for our tap device */ + uint8_t mac[6]; uint8_t hostmac[6]; - - int idx; int lockedmac; int local; int pxeboot; + unsigned int idx; +}; + +struct virtio_dev { + union { + struct vioblk_dev vioblk; + struct vionet_dev vionet; + }; + + struct imsgev async_iev; + struct imsgev sync_iev; + + int sync_fd; /* fd for synchronous channel */ + int async_fd; /* fd for async channel */ + uint8_t pci_id; + uint32_t vm_id; + uint32_t vm_vmid; + int irq; + + pid_t dev_pid; + char dev_type; + SLIST_ENTRY(virtio_dev) dev_next; }; struct virtio_net_hdr { @@ -290,7 +333,12 @@ uint32_t vring_size(uint32_t); int virtio_dump(int); int virtio_restore(int, struct vmd_vm *, int, int[][VM_MAX_BASE_PER_DISK], int *); +const char *virtio_reg_name(uint8_t); uint32_t vring_size(uint32_t); +int vm_device_pipe(struct virtio_dev *, void (*)(int, short, void *)); +int virtio_pci_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); +void virtio_assert_pic_irq(struct virtio_dev *, int); +void virtio_deassert_pic_irq(struct virtio_dev *, int); int virtio_rnd_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int viornd_dump(int); @@ -305,21 +353,19 @@ int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t * int virtio_raw_create(const char *, uint64_t); int virtio_raw_init(struct virtio_backing *, off_t *, int*, size_t); -int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vioblk_dump(int); int vioblk_restore(int, struct vmd_vm *, int[][VM_MAX_BASE_PER_DISK]); void vioblk_update_qs(struct vioblk_dev *); void vioblk_update_qa(struct vioblk_dev *); int vioblk_notifyq(struct vioblk_dev *); -int virtio_net_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t); int vionet_dump(int); int vionet_restore(int, struct vmd_vm *, int *); void vionet_update_qs(struct vionet_dev *); void vionet_update_qa(struct vionet_dev *); -int vionet_notifyq(struct vionet_dev *); -void vionet_notify_rx(struct vionet_dev *); -int vionet_notify_tx(struct vionet_dev *); +int vionet_notifyq(struct 
virtio_dev *); +void vionet_notify_rx(struct virtio_dev *); +int vionet_notify_tx(struct virtio_dev *); void vionet_process_rx(uint32_t); int vionet_enq_rx(struct vionet_dev *, char *, size_t, int *); void vionet_set_hostmac(struct vmd_vm *, unsigned int, uint8_t *);
@@ -336,7 +382,7 @@ ssize_t dhcp_request(struct vionet_dev *, char *, size int vioscsi_restore(int, struct vmd_vm *, int); /* dhcp.c */ -ssize_t dhcp_request(struct vionet_dev *, char *, size_t, char **); +ssize_t dhcp_request(struct virtio_dev *, char *, size_t, char **); /* vioscsi.c */ int vioscsi_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
blob - 319cee8d6e3f2f50fe7c3fe36f7c39d6b1dae97a blob + 1ae60983755be95bd182de1b3df4ff07a2acf779
--- usr.sbin/vmd/vm.c
+++ usr.sbin/vmd/vm.c
@@ -81,8 +81,8 @@ int alloc_guest_mem(struct vm_create_params *); int vcpu_exit(struct vm_run_params *); int vcpu_reset(uint32_t, uint32_t, struct vcpu_reg_state *); void create_memory_map(struct vm_create_params *); -int alloc_guest_mem(struct vm_create_params *); static int vmm_create_vm(struct vmd_vm *); +int alloc_guest_mem(struct vmd_vm *); void init_emulated_hw(struct vmop_create_params *, int, int[][VM_MAX_BASE_PER_DISK], int *); void restore_emulated_hw(struct vm_create_params *, int, int *,
@@ -230,8 +230,8 @@ vm_main(int fd) /* * We aren't root, so we can't chroot(2). Use unveil(2) instead. */ - if (unveil("/var/empty", "") == -1) - fatal("unveil /var/empty"); + if (unveil(env->argv0, "x") == -1) + fatal("unveil %s", env->argv0); if (unveil(NULL, NULL) == -1) fatal("unveil lock");
@@ -239,10 +239,12 @@ * pledge in the vm processes: * stdio - for malloc and basic I/O including events. * vmm - for the vmm ioctls and operations. + * proc exec - fork/exec for launching devices. * recvfd - for vm send/recv and sending fd to devices. - * proc - required for vmm(4) VMM_IOC_CREATE ioctl + * tmppath/rpath - for shm_mkstemp, ftruncate, unlink */ - if (pledge("stdio vmm recvfd proc", NULL) == -1) + if (pledge("stdio vmm proc exec recvfd " + "tmppath rpath", NULL) == -1) fatal("pledge"); /* Receive our vm configuration. */
@@ -372,7 +374,7 @@ start_vm(struct vmd_vm *vm, int fd) if (!(vm->vm_state & VM_STATE_RECEIVED)) create_memory_map(vcp); - ret = alloc_guest_mem(&vm->vm_params.vmc_params); + ret = alloc_guest_mem(vm); if (ret) { struct rlimit lim; char buf[FMT_SCALED_STRSIZE];
@@ -395,10 +397,6 @@ return (ret); } - /* Tighten pledge now that we've called VMM_IOC_CREATE ioctl. */ - if (pledge("stdio vmm recvfd", NULL) == -1) - fatal("pledge"); - /* * Some of vmd currently relies on global state (current_vm, con_fd). */
@@ -487,15 +485,19 @@ nicfds[i] = vm->vm_ifs[i].vif_fd; if (vm->vm_state & VM_STATE_RECEIVED) { + restore_mem(vm->vm_receive_fd, vcp); restore_emulated_hw(vcp, vm->vm_receive_fd, nicfds, vm->vm_disks, vm->vm_cdrom); - restore_mem(vm->vm_receive_fd, vcp); if (restore_vm_params(vm->vm_receive_fd, vcp)) fatal("restore vm params failed"); unpause_vm(vm); } else init_emulated_hw(vmc, vm->vm_cdrom, vm->vm_disks, nicfds); + /* Drop privileges further before starting the vcpu run loop(s). */ + if (pledge("stdio vmm recvfd", NULL) == -1) + fatal("pledge"); + /* * Execute the vcpu run loop(s) for this VM. */
@@ -697,6 +699,10 @@ send_vm(int fd, struct vmd_vm *vm) } } + /* Dump memory before devices to aid in restoration.
*/ + if ((ret = dump_mem(fd, vm))) + goto err; + if ((ret = i8253_dump(fd))) goto err; if ((ret = i8259_dump(fd))) @@ -1086,31 +1092,67 @@ alloc_guest_mem(struct vm_create_params *vcp) * !0: failure - errno indicating the source of the failure */ int -alloc_guest_mem(struct vm_create_params *vcp) +alloc_guest_mem(struct vmd_vm *vm) { void *p; - int ret; + char *tmp; + int fd, ret = 0; size_t i, j; + struct vm_create_params *vcp = &vm->vm_params.vmc_params; struct vm_mem_range *vmr; + tmp = calloc(32, sizeof(char)); + if (tmp == NULL) { + ret = errno; + log_warn("%s: calloc", __func__); + return (ret); + } + strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); + + vm->vm_nmemfds = vcp->vcp_nmemranges; + for (i = 0; i < vcp->vcp_nmemranges; i++) { vmr = &vcp->vcp_memranges[i]; + + fd = shm_mkstemp(tmp); + if (fd < 0) { + ret = errno; + log_warn("%s: shm_mkstemp", __func__); + return (ret); + } + if (ftruncate(fd, vmr->vmr_size) == -1) { + ret = errno; + log_warn("%s: ftruncate", __func__); + goto out; + } + if (fcntl(fd, F_SETFD, 0) == -1) { + ret = errno; + log_warn("%s: fcntl", __func__); + goto out; + } + if (shm_unlink(tmp) == -1) { + ret = errno; + log_warn("%s: shm_unlink", __func__); + goto out; + } + strlcpy(tmp, "/tmp/vmd.XXXXXXXXXX", 32); + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + MAP_SHARED | MAP_CONCEAL, fd, 0); if (p == MAP_FAILED) { ret = errno; for (j = 0; j < i; j++) { vmr = &vcp->vcp_memranges[j]; munmap((void *)vmr->vmr_va, vmr->vmr_size); } - - return (ret); + goto out; } - + vm->vm_memfds[i] = fd; vmr->vmr_va = (vaddr_t)p; } - - return (0); +out: + free(tmp); + return (ret); } /* @@ -2499,3 +2541,60 @@ vm_pipe_recv(struct vm_dev_pipe *p) return msg; } + +/* + * Re-map the guest address space using the shared memory file descriptor. + * + * Returns 0 on success, non-zero in event of failure. + */ +int +remap_guest_mem(struct vmd_vm *vm) +{ + struct vm_create_params *vcp; + struct vm_mem_range *vmr; + size_t i, j; + void *p = NULL; + int ret; + + if (vm == NULL) + return (1); + + vcp = &vm->vm_params.vmc_params; + + /* + * We've execve'd, so we need to re-map the guest VM memory. Iterate + * over all possible vm_mem_range entries so we can initialize all + * file descriptors to a value. + */ + for (i = 0; i < VMM_MAX_MEM_RANGES; i++) { + if (i < vcp->vcp_nmemranges) { + vmr = &vcp->vcp_memranges[i]; + /* Skip ranges we know we don't need right now. */ + if (vmr->vmr_type == VM_MEM_MMIO) { + log_debug("%s: skipping range i=%ld, type=%d", + __func__, i, vmr->vmr_type); + vm->vm_memfds[i] = -1; + continue; + } + /* Re-mmap the memrange. */ + p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_CONCEAL, vm->vm_memfds[i], 0); + if (p == MAP_FAILED) { + ret = errno; + log_warn("%s: mmap", __func__); + for (j = 0; j < i; j++) { + vmr = &vcp->vcp_memranges[j]; + munmap((void *)vmr->vmr_va, + vmr->vmr_size); + } + return (ret); + } + vmr->vmr_va = (vaddr_t)p; + } else { + /* Initialize with an invalid fd. 
*/ + vm->vm_memfds[i] = -1; + } + } + + return (0); +} blob - 1246c91eb61eff2a9b2ce96a2e3ab8140c2327c7 blob + f8af05f15a619dbb799bde46d36f0ae0ccd324f0 --- usr.sbin/vmd/vmd.c +++ usr.sbin/vmd/vmd.c @@ -774,13 +774,14 @@ main(int argc, char **argv) int proc_instance = 0, vm_launch = 0, vm_fd = -1; const char *errp, *title = NULL; int argc0 = argc; + char dev_type = '\0'; log_init(0, LOG_DAEMON); if ((env = calloc(1, sizeof(*env))) == NULL) fatal("calloc: env"); - while ((ch = getopt(argc, argv, "D:P:I:V:df:vn")) != -1) { + while ((ch = getopt(argc, argv, "D:P:I:V:X:df:nt:v")) != -1) { switch (ch) { case 'D': if (cmdline_symset(optarg) < 0) @@ -812,13 +813,28 @@ main(int argc, char **argv) if (errp) fatalx("invalid process instance"); break; - /* child vm fork/exec */ + /* child vm and device fork/exec */ case 'V': vm_launch = VMD_LAUNCH_VM; vm_fd = strtonum(optarg, 0, 128, &errp); if (errp) fatalx("invalid vm fd"); break; + case 'X': + vm_launch = VMD_LAUNCH_DEV; + vm_fd = strtonum(optarg, 0, 128, &errp); + if (errp) + fatalx("invalid device fd"); + break; + case 't': + dev_type = *optarg; + switch (dev_type) { + case VMD_DEVTYPE_NET: + case VMD_DEVTYPE_DISK: + break; + default: fatalx("invalid device type"); + } + break; default: usage(); } @@ -865,6 +881,15 @@ main(int argc, char **argv) if (vm_launch == VMD_LAUNCH_VM) { vm_main(vm_fd); /* NOTREACHED */ + } else if (vm_launch == VMD_LAUNCH_DEV) { + if (dev_type == VMD_DEVTYPE_NET) { + vionet_main(vm_fd); + /* NOTREACHED */ + } else if (dev_type == VMD_DEVTYPE_DISK) { + vioblk_main(vm_fd); + /* NOTREACHED */ + } + fatalx("unsupported device type '%c'", dev_type); } /* Open /dev/vmm early. */ blob - 00becd961c4ea7932f0bd08562bc796656b0f2a6 blob + 1e837e0990bccf58e8fa47b8be513e443e3f9b2b --- usr.sbin/vmd/vmd.h +++ usr.sbin/vmd/vmd.h @@ -71,7 +71,11 @@ /* Launch mode identifiers for when a vm fork+exec's. */ #define VMD_LAUNCH_VM 1 +#define VMD_LAUNCH_DEV 2 +#define VMD_DEVTYPE_NET 'n' +#define VMD_DEVTYPE_DISK 'd' + /* Rate-limit fast reboots */ #define VM_START_RATE_SEC 6 /* min. seconds since last reboot */ #define VM_START_RATE_LIMIT 3 /* max. number of fast reboots */ @@ -135,7 +139,10 @@ enum imsg_type { IMSG_VMDOP_VM_SHUTDOWN, IMSG_VMDOP_VM_REBOOT, IMSG_VMDOP_CONFIG, - IMSG_VMDOP_DONE + IMSG_VMDOP_DONE, + /* Device Operation Messages */ + IMSG_DEVOP_HOSTMAC, + IMSG_DEVOP_MSG, }; struct vmop_result { @@ -317,6 +324,9 @@ struct vmd_vm { struct timeval vm_start_tv; int vm_start_limit; + int vm_memfds[VMM_MAX_MEM_RANGES]; + size_t vm_nmemfds; + TAILQ_ENTRY(vmd_vm) vm_entry; }; TAILQ_HEAD(vmlist, vmd_vm); @@ -484,6 +494,7 @@ void* hvaddr_mem(paddr_t, size_t); enum pipe_msg_type vm_pipe_recv(struct vm_dev_pipe *); int write_mem(paddr_t, const void *buf, size_t); void* hvaddr_mem(paddr_t, size_t); +int remap_guest_mem(struct vmd_vm *); /* config.c */ int config_init(struct vmd *); @@ -510,4 +521,10 @@ int virtio_get_base(int, char *, size_t, int, const c /* virtio.c */ int virtio_get_base(int, char *, size_t, int, const char *); +/* vionet.c */ +__dead void vionet_main(int); + +/* vioblk.c */ +__dead void vioblk_main(int); + #endif /* VMD_H */
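
(Not part of the diff, just for reviewers following the new protocol: a
device child answering one of the synchronous register reads described
above would look roughly like the sketch below. The function name
vio_answer_io_read() and the read_reg() helper are made up for
illustration and assume vmd's usual imsg/log helpers; the real handlers
live in the new vioblk.c and vionet.c.)

/*
 * Sketch: reply to a single VIODEV_MSG_IO_READ on the sync channel.
 * "ibuf" is the child's end of sync_fds[1]; "msg" is the request
 * already pulled off the channel with imsg_get(), using struct
 * viodev_msg from virtio.h above.
 */
static void
vio_answer_io_read(struct imsgbuf *ibuf, struct viodev_msg *msg)
{
	msg->data = read_reg(msg->reg, msg->io_sz);	/* device specific */
	msg->data_valid = 1;
	msg->state = INTR_STATE_NOOP;		/* no irq change in this reply */

	/* The vm process blocks in virtio_pci_io() until this arrives. */
	if (imsg_compose(ibuf, IMSG_DEVOP_MSG, 0, 0, -1, msg,
	    sizeof(*msg)) == -1 || imsg_flush(ibuf) == -1)
		log_warn("%s: reply failed", __func__);
}

The message type is left as VIODEV_MSG_IO_READ so the data_valid check
on the vm side of virtio_pci_io() accepts the reply.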