vmd(8) users of tech@,
NOTE: I have no intention to try to commit this prior to 6.9's release
due to its complexity, but I didn't want to "wait" to solicit testers or
potential feedback.
I noticed recently that I could not have two vmctl(8) clients "wait" for
the same vm to shutdown as one would cancel the other. Worse yet, if you
cancel a wait (^C) you can effectively corrupt the state being used for
tracking the waiting client preventing future clients from waiting on
the vm.
It turns out the socket fd of the vmctl(8) client is being sent by the
control process as the peerid in the imsg. This fd is being stored on
the vmd_vm structure in the vm_peerid member, but this fd only has
meaning in the scope of the control process. Consequently:
- only 1 value can be stored at a time, meaning only 1 waiting client
can exist at a time
- since vm_peerid is used for storing if another vmd(8) process is
waiting on the vm, vm_peerid can be corrupted by vmctl(8)
- the control process cannot update waiting state on vmctl disconnects
and since fd's are reused it's possible the message could be sent to a
vmctl(8) client performing an operation other than a "wait"
The below diff:
1. enables support for multiple vmctl(8) clients to wait on the same vm
to terminate
2. keeps the wait state in the control process and out of the parent's
global vm state, tracking waiting parties in a TAILQ
3. removes the waiting client state on client disconnect/cancellation
4. simplifies vmd(8) by removing IMSG_VMDOP_WAIT_VM_REQUEST handling
from the vmm process, which isn't needed (and was partially
responsible for the corruption)
There are some subsequent tweaks that may follow this diff, specifically
one related to the fact I've switched the logic to send
IMSG_VMDOP_TERMINATE_VM_EVENT messages to the control process (which
makes sense to me) but control relays a IMSG_VMDOP_TERMINATE_VM_RESPONSE
message to the waiting vmctl(8) client. I'd need to update vmctl(8) to
look for the other event and don't want to complicate the diff further.
If any testers out there can try to break this for me it would be much
appreciated. :-)
-Dave
Index: control.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/control.c,v
retrieving revision 1.33
diff -u -p -r1.33 control.c
--- control.c 11 Apr 2021 18:53:23 -0000 1.33
+++ control.c 12 Apr 2021 11:20:08 -0000
@@ -39,7 +39,15 @@
#define CONTROL_BACKLOG 5
-struct ctl_connlist ctl_conns;
+struct ctl_notify {
+ int ctl_fd;
+ uint32_t ctl_vmid;
+ TAILQ_ENTRY(ctl_notify) entry;
+};
+TAILQ_HEAD(ctl_notifylist, ctl_notify);
+
+struct ctl_notifylist notifylist;
+struct ctl_connlist ctl_conns;
void
control_accept(int, short, void *);
@@ -78,7 +86,10 @@ int
control_dispatch_vmd(int fd, struct privsep_proc *p, struct imsg *imsg)
{
struct ctl_conn *c;
+ struct ctl_notify *notify, *notify_next;
struct privsep *ps = p->p_ps;
+ struct vmop_result vmr;
+ int waiting = 0;
switch (imsg->hdr.type) {
case IMSG_VMDOP_START_VM_RESPONSE:
@@ -86,11 +97,11 @@ control_dispatch_vmd(int fd, struct priv
case IMSG_VMDOP_SEND_VM_RESPONSE:
case IMSG_VMDOP_RECEIVE_VM_RESPONSE:
case IMSG_VMDOP_UNPAUSE_VM_RESPONSE:
- case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
case IMSG_VMDOP_GET_INFO_VM_DATA:
case IMSG_VMDOP_GET_INFO_VM_END_DATA:
case IMSG_CTL_FAIL:
case IMSG_CTL_OK:
+ /* Provide basic response back to a specific control client */
if ((c = control_connbyfd(imsg->hdr.peerid)) == NULL) {
log_warnx("%s: lost control connection: fd %d",
__func__, imsg->hdr.peerid);
@@ -99,6 +110,50 @@ control_dispatch_vmd(int fd, struct priv
imsg_compose_event(&c->iev, imsg->hdr.type,
0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg));
break;
+ case IMSG_VMDOP_TERMINATE_VM_RESPONSE:
+ IMSG_SIZE_CHECK(imsg, &vmr);
+ memcpy(&vmr, imsg->data, sizeof(vmr));
+
+ if ((c = control_connbyfd(imsg->hdr.peerid)) != NULL) {
+ TAILQ_FOREACH(notify, ¬ifylist, entry) {
+ if (notify->ctl_fd == (int) imsg->hdr.peerid) {
+ /* Update if waiting by vm name */
+ if (notify->ctl_vmid < 1)
+ notify->ctl_vmid = vmr.vmr_id;
+ waiting = 1;
+ break;
+ }
+ }
+ } else {
+ log_warnx("%s: lost control connection: fd %d",
+ __func__, imsg->hdr.peerid);
+ return (0);
+ }
+
+ /* An error needs to be relayed to the client immediately */
+ if (!waiting || vmr.vmr_result)
+ imsg_compose_event(&c->iev, imsg->hdr.type,
+ 0, 0, imsg->fd, imsg->data, IMSG_DATA_SIZE(imsg));
+ break;
+ case IMSG_VMDOP_TERMINATE_VM_EVENT:
+ /* Notify any waiting clients that a VM terminated */
+ IMSG_SIZE_CHECK(imsg, &vmr);
+ memcpy(&vmr, imsg->data, sizeof(vmr));
+
+ TAILQ_FOREACH_SAFE(notify, ¬ifylist, entry, notify_next) {
+ if (notify->ctl_vmid != vmr.vmr_id)
+ continue;
+ if ((c = control_connbyfd(notify->ctl_fd)) != NULL) {
+ /* XXX vmctl expects *_RESPONSE, not *_EVENT */
+ imsg_compose_event(&c->iev,
+ IMSG_VMDOP_TERMINATE_VM_RESPONSE,
+ 0, 0, imsg->fd, imsg->data,
+ IMSG_DATA_SIZE(imsg));
+ TAILQ_REMOVE(¬ifylist, notify, entry);
+ free(notify);
+ }
+ }
+ break;
case IMSG_VMDOP_CONFIG:
config_getconfig(ps->ps_env, imsg);
proc_compose(ps, PROC_PARENT, IMSG_VMDOP_DONE, NULL, 0);
@@ -123,6 +178,9 @@ control_init(struct privsep *ps, struct
if (cs->cs_name == NULL)
return (0);
+ TAILQ_INIT(&ctl_conns);
+ TAILQ_INIT(¬ifylist);
+
if ((fd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0)) == -1) {
log_warn("%s: socket", __func__);
return (-1);
@@ -276,7 +334,8 @@ control_connbyfd(int fd)
void
control_close(int fd, struct control_sock *cs)
{
- struct ctl_conn *c;
+ struct ctl_conn *c;
+ struct ctl_notify *notify, *notify_next;
if ((c = control_connbyfd(fd)) == NULL) {
log_warn("%s: fd %d: not found", __func__, fd);
@@ -286,6 +345,14 @@ control_close(int fd, struct control_soc
msgbuf_clear(&c->iev.ibuf.w);
TAILQ_REMOVE(&ctl_conns, c, entry);
+ TAILQ_FOREACH_SAFE(notify, ¬ifylist, entry, notify_next) {
+ if (notify->ctl_fd == fd) {
+ TAILQ_REMOVE(¬ifylist, notify, entry);
+ free(notify);
+ break;
+ }
+ }
+
event_del(&c->iev.ev);
close(c->iev.ibuf.fd);
@@ -308,7 +375,8 @@ control_dispatch_imsg(int fd, short even
struct imsg imsg;
struct vmop_create_params vmc;
struct vmop_id vid;
- int n, v, ret = 0;
+ struct ctl_notify *notify;
+ int n, v, wait = 0, ret = 0;
if ((c = control_connbyfd(fd)) == NULL) {
log_warn("%s: fd %d: not found", __func__, fd);
@@ -388,11 +456,25 @@ control_dispatch_imsg(int fd, short even
}
break;
case IMSG_VMDOP_WAIT_VM_REQUEST:
+ wait = 1;
+ /* FALLTHROUGH */
case IMSG_VMDOP_TERMINATE_VM_REQUEST:
if (IMSG_DATA_SIZE(&imsg) < sizeof(vid))
goto fail;
memcpy(&vid, imsg.data, sizeof(vid));
vid.vid_uid = c->peercred.uid;
+
+ if (wait || vid.vid_flags & VMOP_WAIT) {
+ vid.vid_flags |= VMOP_WAIT;
+ notify = calloc(1, sizeof(struct ctl_notify));
+ if (notify == NULL)
+ fatal("%s: calloc", __func__);
+ notify->ctl_vmid = vid.vid_id;
+ notify->ctl_fd = fd;
+ TAILQ_INSERT_TAIL(¬ifylist, notify, entry);
+ log_debug("%s: registered wait for peer %d",
+ __func__, fd);
+ }
if (proc_compose_imsg(ps, PROC_PARENT, -1,
imsg.hdr.type, fd, -1, &vid, sizeof(vid)) == -1) {
Index: vmd.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
retrieving revision 1.122
diff -u -p -r1.122 vmd.c
--- vmd.c 5 Apr 2021 11:35:26 -0000 1.122
+++ vmd.c 12 Apr 2021 11:20:08 -0000
@@ -128,42 +128,42 @@ vmd_dispatch_control(int fd, struct priv
IMSG_SIZE_CHECK(imsg, &vid);
memcpy(&vid, imsg->data, sizeof(vid));
flags = vid.vid_flags;
+ cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
if ((id = vid.vid_id) == 0) {
/* Lookup vm (id) by name */
if ((vm = vm_getbyname(vid.vid_name)) == NULL) {
res = ENOENT;
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
break;
} else if ((vm->vm_state & VM_STATE_SHUTDOWN) &&
(flags & VMOP_FORCE) == 0) {
res = EALREADY;
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
break;
} else if (!(vm->vm_state & VM_STATE_RUNNING)) {
res = EINVAL;
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
break;
}
id = vm->vm_vmid;
} else if ((vm = vm_getbyvmid(id)) == NULL) {
res = ENOENT;
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
break;
}
if (vm_checkperm(vm, &vm->vm_params.vmc_owner,
- vid.vid_uid) != 0) {
+ vid.vid_uid) != 0) {
res = EPERM;
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
break;
}
- memset(&vid, 0, sizeof(vid));
- vid.vid_id = id;
- vid.vid_flags = flags;
- if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
- imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
- return (-1);
+ /* Only relay TERMINATION requests, not WAIT requests */
+ if (imsg->hdr.type == IMSG_VMDOP_TERMINATE_VM_REQUEST) {
+ memset(&vid, 0, sizeof(vid));
+ vid.vid_id = id;
+ vid.vid_flags = flags;
+
+ if (proc_compose_imsg(ps, PROC_VMM, -1, imsg->hdr.type,
+ imsg->hdr.peerid, -1, &vid, sizeof(vid)) == -1)
+ return (-1);
+ }
break;
case IMSG_VMDOP_GET_INFO_VM_REQUEST:
proc_forward_imsg(ps, imsg, PROC_VMM, -1);
@@ -478,16 +478,13 @@ vmd_dispatch_vmm(int fd, struct privsep_
config_setvm(ps, vm, (uint32_t)-1, vm->vm_uid);
}
- /* Send a response if a control client is waiting for it */
- if (imsg->hdr.peerid != (uint32_t)-1) {
- /* the error is meaningless for deferred responses */
- vmr.vmr_result = 0;
+ /* The error is meaningless for deferred responses */
+ vmr.vmr_result = 0;
- if (proc_compose_imsg(ps, PROC_CONTROL, -1,
- IMSG_VMDOP_TERMINATE_VM_RESPONSE,
- imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
- return (-1);
- }
+ if (proc_compose_imsg(ps, PROC_CONTROL, -1,
+ IMSG_VMDOP_TERMINATE_VM_EVENT,
+ imsg->hdr.peerid, -1, &vmr, sizeof(vmr)) == -1)
+ return (-1);
break;
case IMSG_VMDOP_GET_INFO_VM_DATA:
IMSG_SIZE_CHECK(imsg, &vir);
Index: vmm.c
===================================================================
RCS file: /cvs/src/usr.sbin/vmd/vmm.c,v
retrieving revision 1.100
diff -u -p -r1.100 vmm.c
--- vmm.c 11 Apr 2021 21:02:40 -0000 1.100
+++ vmm.c 12 Apr 2021 11:20:08 -0000
@@ -150,30 +150,6 @@ vmm_dispatch_parent(int fd, struct privs
res = ENOENT;
cmd = IMSG_VMDOP_START_VM_RESPONSE;
break;
- case IMSG_VMDOP_WAIT_VM_REQUEST:
- IMSG_SIZE_CHECK(imsg, &vid);
- memcpy(&vid, imsg->data, sizeof(vid));
- id = vid.vid_id;
-
- DPRINTF("%s: recv'ed WAIT_VM for %d", __func__, id);
-
- cmd = IMSG_VMDOP_TERMINATE_VM_RESPONSE;
- if (id == 0) {
- res = ENOENT;
- } else if ((vm = vm_getbyvmid(id)) != NULL) {
- if (vm->vm_peerid != (uint32_t)-1) {
- peerid = vm->vm_peerid;
- res = EINTR;
- } else
- cmd = 0;
- vm->vm_peerid = imsg->hdr.peerid;
- } else {
- /* vm doesn't exist, cannot stop vm */
- log_debug("%s: cannot stop vm that is not running",
- __func__);
- res = VMD_VM_STOP_INVALID;
- }
- break;
case IMSG_VMDOP_TERMINATE_VM_REQUEST:
IMSG_SIZE_CHECK(imsg, &vid);
memcpy(&vid, imsg->data, sizeof(vid));
@@ -221,15 +197,6 @@ vmm_dispatch_parent(int fd, struct privs
__func__);
res = VMD_VM_STOP_INVALID;
}
- }
- if ((flags & VMOP_WAIT) &&
- res == 0 && (vm->vm_state & VM_STATE_SHUTDOWN)) {
- if (vm->vm_peerid != (uint32_t)-1) {
- peerid = vm->vm_peerid;
- res = EINTR;
- } else
- cmd = 0;
- vm->vm_peerid = imsg->hdr.peerid;
}
} else {
/* VM doesn't exist, cannot stop vm */