Ludovic Courtès <l...@gnu.org> skribis:

> This is due to a race condition: the process terminates before its
> service goes from ‘starting’ to ‘running’.
>
> By the time the service controller calls ‘monitor-service-process’, the
> process has already terminated, so the process monitor replies 0 to the
> 'await request because that process no longer exists.

Fixed in 88cc5a43bf04c13b00b15a8d93cb635d9b64713c by introducing an
atomic fork + monitor primitive.

For the record, I also considered a hacky but less intrusive solution:
adding support for “zombies” in the process monitor, whereby it would
record the status of terminated processes and give that (instead of 0)
when somebody awaits them (patch attached).

It’s fragile though because unlike real zombies, we cannot guarantee
that the PID won’t be reused in the meantime; it’s just unlikely.  (PID
FDs would help, but that’s not portable so best if we can avoid it.)

Ludo’.

diff --git a/modules/shepherd/service.scm b/modules/shepherd/service.scm
index 221998b..fec773c 100644
--- a/modules/shepherd/service.scm
+++ b/modules/shepherd/service.scm
@@ -2345,25 +2345,35 @@ may never terminate, even after sending it SIGKILL---e.g., kthreadd on Linux."
           (const #f)))
       (const #f)))
 
+(define %max-zombie-processes
+  ;; Number of processes recorded as "zombies" by the process
+  ;; monitor--processes that had no waiters when they terminated.
+  20)
+
 (define (process-monitor channel)
   "Run a process monitor that handles requests received over @var{channel}."
-  (let loop ((waiters vlist-null))
+  (let loop ((waiters vlist-null)
+             (zombies (ring-buffer %max-zombie-processes)))
     (match (get-message channel)
       (('handle-process-termination pid status)
-       ;; Notify any waiters.
-       (vhash-foldv* (lambda (waiter _)
-                       (put-message waiter status)
-                       #t)
-                     #t pid waiters)
-
-       ;; XXX: The call below is linear in the size of WAITERS, but WAITERS is
-       ;; usually empty or small.
-       (loop (vhash-fold (lambda (key value result)
-                           (if (= key pid)
-                               result
-                               (vhash-consv key value result)))
-                         vlist-null
-                         waiters)))
+       ;; Notify any waiters.  When there are none, record PID and STATUS in
+       ;; the ZOMBIES buffer in case somebody asks for it later--
+       (let ((zombie? (vhash-foldv* (lambda (waiter _)
+                                      (put-message waiter status)
+                                      #f)
+                                    #t pid waiters)))
+
+         ;; XXX: The call below is linear in the size of WAITERS, but WAITERS
+         ;; is usually empty or small.
+         (loop (vhash-fold (lambda (key value result)
+                             (if (= key pid)
+                                 result
+                                 (vhash-consv key value result)))
+                           vlist-null
+                           waiters)
+               (if zombie?
+                   (ring-buffer-insert (cons pid status) zombies)
+                   zombies))))
 
       (('spawn arguments service reply)
        ;; Spawn the command as specified by ARGUMENTS; send the spawn result
@@ -2378,22 +2388,31 @@ may never terminate, even after sending it SIGKILL---e.g., kthreadd on Linux."
          (put-message reply result)
          (match result
            (('exception . _)
-            (loop waiters))
+            (loop waiters zombies))
            (('success (pid))
-            (loop (vhash-consv pid reply waiters))))))
+            (loop (vhash-consv pid reply waiters) zombies)))))
 
       (('await pid reply)
        ;; Await the termination of PID and send its status on REPLY.
        (if (and (catch-system-error (kill pid 0))
                 (not (pseudo-process? pid)))
-           (loop (vhash-consv pid reply waiters))
-           (begin                             ;PID is gone or a pseudo-process
-             ;; This might be a race condition (the caller thinks PID is up
-             ;; when it's already dead) so log it.
-             (local-output (l10n "Awaiting PID ~a, which is already gone.")
-                           pid)
-             (put-message reply 0)
-             (loop waiters)))))))
+           (loop (vhash-consv pid reply waiters) zombies)
+           ;; PID is gone or a pseudo-process; in the former case, check if we
+           ;; have info about it in ZOMBIES (this is linear in the length of
+           ;; ZOMBIES, which is small.)
+           (let* ((zombies* (ring-buffer->list zombies))
+                  (status (assoc-ref zombies* pid)))
+             (unless status
+               ;; This might be a race condition (the caller thinks PID is up
+               ;; when it's already dead) so log it.
+               (local-output (l10n "Awaiting PID ~a, which is already gone.")
+                             pid))
+             (put-message reply (or status 0))
+             (loop waiters
+                   (if status
+                       (list->ring-buffer (alist-delete pid zombies*)
+                                          %max-zombie-processes)
+                       zombies))))))))
 
 (define spawn-process-monitor
   (essential-task-launcher 'process-monitor process-monitor))

Reply via email to