On Fri, Apr 07, 2023 at 12:07:07PM -0400, Dave Voutila wrote:
>
> Dave Voutila <d...@sisu.io> writes:
>
> > In vmd, the vmm process forks to create the resulting vm process. After
> > this fork, the vmm parent process closes all the file descriptors
> > pointing to the vm's devices (cdrom, kernel, disks, nics, etc.).
> >
> > The logic was a bit funky, so this change relies on the fact we can
> > attempt the close(2) call and use its success/failure to determine if we
> > have an fd to mark -1 in the vm structure. (I guess we could just
> > blindly set them to -1 post-close, but this feels more sensible to me.)
> >
> > While in the function, it cleans up some commentary and logging around
> > the actual fork(2) call. Since fork(2) sets errno, we can use it in the
> > log message, too.
> >
> > This reduces some noise in my upcoming diff to introduce execvp to the
> > forking of child vm processes (as presented at AsiaBSDCon).
> >
>
> A touch longer, but won't generate ktrace noise from blind calls to close(2)
> and also accounts for the other error conditions (EINTR, EIO).
>
> For EIO, not sure yet how we want to handle it other than log it.
>
> For EINTR, we want to account for that race and make sure we retry since
> the vmm process is long-lived and could inadvertently keep things like
> tty fds or disk image fds open after the guest vm terminates.
>
> I need to check on other calls to close(2) in vmd, but most fds are
> passed via imsg, so I'd presume any interruption is handled in the
> kernel. (Someone correct me if I'm wrong.)
>
>

as we discussed this effort previously in .jp, ok mlarkin@

> diff refs/heads/master refs/heads/vmd-vmm-fd-dance
> commit - 8371c6b4d5765a69d520f93d64f57c6a2989f9ae
> commit + 9c07553a618956ba89f29b93906dcd6ea6c19de8
> blob - 75af37a29a6cd4917d8c4ca9be35c48772314f4b
> blob + e0af71dac5d730c7b88166384a95b94bf14fcfc4
> --- usr.sbin/vmd/vmd.c
> +++ usr.sbin/vmd/vmd.c
> @@ -1956,3 +1956,21 @@ vm_terminate(struct vmd_vm *vm, const char *caller)
>               vm_remove(vm, caller);
>       }
>  }
> +
> +int
> +close_fd(int fd)
> +{
> +     int     ret;
> +
> +     if (fd == -1)
> +             return (0);
> +
> +     do
> +             ret = close(fd);
> +     while (ret == -1 && errno == EINTR);
> +
> +     if (ret == -1 && errno == EIO)
> +             log_warn("%s(%d)", __func__, fd);
> +
> +     return (ret);
> +}
> blob - 7bbbf62734bc7cd575c4fee384193037b0495bb4
> blob + 7228ace4b31a9fd6b5eb90bf1d048198a03a04fc
> --- usr.sbin/vmd/vmd.h
> +++ usr.sbin/vmd/vmd.h
> @@ -443,6 +443,7 @@ void       getmonotime(struct timeval *);
>  uint32_t prefixlen2mask(uint8_t);
>  void  prefixlen2mask6(u_int8_t, struct in6_addr *);
>  void  getmonotime(struct timeval *);
> +int   close_fd(int);
>
>  /* priv.c */
>  void  priv(struct privsep *, struct privsep_proc *);
> blob - d9eff3c8f703854c7b1e49fee04b8e426956cfbb
> blob + 7db920beec7e34a63f5ba3602f17185eec33d3f7
> --- usr.sbin/vmd/vmm.c
> +++ usr.sbin/vmd/vmm.c
> @@ -641,12 +641,10 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>       if (socketpair(AF_UNIX, SOCK_STREAM, PF_UNSPEC, fds) == -1)
>               fatal("socketpair");
>
> -     /* Start child vmd for this VM (fork, chroot, drop privs) */
> +     /* Fork the vmm process to create the vm, inheriting open device fds. */
>       vm_pid = fork();
> -
> -     /* Start child failed? - cleanup and leave */
>       if (vm_pid == -1) {
> -             log_warnx("%s: start child failed", __func__);
> +             log_warn("%s: fork child failed", __func__);
>               ret = EIO;
>               goto err;
>       }
> @@ -654,31 +652,24 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>       if (vm_pid > 0) {
>               /* Parent */
>               vm->vm_pid = vm_pid;
> -             close(fds[1]);
> +             close_fd(fds[1]);
>
>               for (i = 0 ; i < vcp->vcp_ndisks; i++) {
>                       for (j = 0; j < VM_MAX_BASE_PER_DISK; j++) {
> -                             if (vm->vm_disks[i][j] != -1)
> -                                     close(vm->vm_disks[i][j]);
> -                             vm->vm_disks[i][j] = -1;
> +                             if (close_fd(vm->vm_disks[i][j]) == 0)
> +                                 vm->vm_disks[i][j] = -1;
>                       }
>               }
>               for (i = 0 ; i < vcp->vcp_nnics; i++) {
> -                     close(vm->vm_ifs[i].vif_fd);
> -                     vm->vm_ifs[i].vif_fd = -1;
> +                     if (close_fd(vm->vm_ifs[i].vif_fd) == 0)
> +                         vm->vm_ifs[i].vif_fd = -1;
>               }
> -             if (vm->vm_kernel != -1) {
> -                     close(vm->vm_kernel);
> +             if (close_fd(vm->vm_kernel) == 0)
>                       vm->vm_kernel = -1;
> -             }
> -             if (vm->vm_cdrom != -1) {
> -                     close(vm->vm_cdrom);
> +             if (close_fd(vm->vm_cdrom) == 0)
>                       vm->vm_cdrom = -1;
> -             }
> -             if (vm->vm_tty != -1) {
> -                     close(vm->vm_tty);
> +             if (close_fd(vm->vm_tty) == 0)
>                       vm->vm_tty = -1;
> -             }
>
>               /* Read back the kernel-generated vm id from the child */
>               sz = atomicio(read, fds[0], &vcp->vcp_id, sizeof(vcp->vcp_id));
> @@ -702,8 +693,8 @@ vmm_start_vm(struct imsg *imsg, uint32_t *id, pid_t *p
>               return (0);
>       } else {
>               /* Child */
> -             close(fds[0]);
> -             close(PROC_PARENT_SOCK_FILENO);
> +             close_fd(fds[0]);
> +             close_fd(PROC_PARENT_SOCK_FILENO);
>
>               ret = start_vm(vm, fds[1]);
>

Reply via email to