When a highly multi-threaded program such as nbdcopy encounters an error, there is a race condition in the library which can cause an assertion failure and thus a core dump:
(1) An error occurs on one of the threads. nbdcopy calls exit(3). (2) In lib/errors.c, the destructor calls pthread_key_delete. (3) Another thread which is still running also encounters an error, and inside libnbd the library calls set_error(). (4) The call to set_error() calls pthread_getspecific. This is undefined behavior per POSIX, since the key has already been destroyed in step (2), but glibc handles it by returning NULL with an EINVAL error (POSIX recommends, but can't mandate, that course of action for technical reasons). In any case, the error message is lost, and any subsequent call to nbd_get_error() will return NULL. (5) We enter the %DEAD state, where there is an assertion that nbd_get_error() is not NULL. This assertion is supposed to be for checking that the code called set_error() before entering the %DEAD state. Although we did call set_error(), the error message was lost at step (4) and nbd_get_error() will always return NULL. This assertion failure causes a crash. There aren't any good ways to fix this. We have proven that the assertions are too tight (it IS possible for one thread's first use of libnbd API, which causes the allocation of a thread-local error context, to occur after another thread has already triggered the destructor via exit(3)), so remove those. Meanwhile, since POSIX says any use of a destroyed key is undefined, add some atomic bookkeeping such that we track a reference count of how many threads have allocated an error context and subsequently had the per-thread data destructor called. Only call pthread_key_delete() if all threads had the chance to exit; leaking small amounts of memory is preferable to undefined behavior. Affected tests that now produce the new leak message: - copy/copy-nbd-error.sh (known issue - we probably want to improve nbdcopy to gracefully issue NBD_CMD_DISC on error, rather than outright calling exit(), to work around the assertion failure in nbdkit 1.32.5) - various fuse/* (not sure if we can address that; cleaning up FUSE is tricky) This reverts a small part of commit 831cbf7ee14c ("generator: Allow DEAD state actions to run"). Thanks: Richard W.M. Jones <rjo...@redhat.com> Signed-off-by: Eric Blake <ebl...@redhat.com> --- Taking Rich's analysis and weakening of the assertions, but tweaking the errors.c to avoid undefined behavior. generator/state_machine_generator.ml | 5 +-- generator/states.c | 1 - lib/errors.c | 50 ++++++++++++++++++---------- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/generator/state_machine_generator.ml b/generator/state_machine_generator.ml index 43c1ce4c..9f94bed1 100644 --- a/generator/state_machine_generator.ml +++ b/generator/state_machine_generator.ml @@ -449,10 +449,7 @@ let pr " abort (); /* Should never happen, but keeps GCC happy. */\n"; pr " }\n"; pr "\n"; - pr " if (r == -1) {\n"; - pr " assert (nbd_get_error () != NULL);\n"; - pr " return -1;\n"; - pr " }\n"; + pr " if (r == -1) return -1;\n"; pr " } while (!blocked);\n"; pr " return 0;\n"; pr "}\n"; diff --git a/generator/states.c b/generator/states.c index fa0f8d71..58a896ca 100644 --- a/generator/states.c +++ b/generator/states.c @@ -192,7 +192,6 @@ STATE_MACHINE { DEAD: /* The caller should have used set_error() before reaching here */ - assert (nbd_get_error ()); abort_option (h); nbd_internal_abort_commands (h, &h->cmds_to_issue); nbd_internal_abort_commands (h, &h->cmds_in_flight); diff --git a/lib/errors.c b/lib/errors.c index 8b77650e..133c752b 100644 --- a/lib/errors.c +++ b/lib/errors.c @@ -34,6 +34,8 @@ struct last_error { /* Thread-local storage of the last error. */ static pthread_key_t errors_key; +/* Zero if errors_key is invalid, else 1 + threads using errors_key. */ +static _Atomic int key_use_count; static void free_errors_key (void *vp) LIBNBD_ATTRIBUTE_NONNULL (1); @@ -51,6 +53,7 @@ errors_key_create (void) strerror (err)); abort (); } + key_use_count++; } /* Destroy the thread-local key when the library is unloaded. */ @@ -59,16 +62,16 @@ static void errors_key_destroy (void) __attribute__ ((destructor)); static void errors_key_destroy (void) { - struct last_error *last_error = pthread_getspecific (errors_key); - /* "No destructor functions shall be invoked by * pthread_key_delete(). Any destructor function that may have been * associated with key shall no longer be called upon thread exit." + * If any threads were still using the key, the memory they allocated + * will be leaked; this is unusual enough to diagnose. */ - if (last_error != NULL) { - free (last_error->error); - free (last_error); - } + if (key_use_count > 1) + fprintf (stderr, "%s: %s: %d threads leaking thread-local data\n", + "libnbd", "errors_key_destroy", key_use_count - 1); + key_use_count = 0; pthread_key_delete (errors_key); } @@ -80,6 +83,8 @@ free_errors_key (void *vp) { struct last_error *last_error = vp; + assert (key_use_count > 1); + key_use_count--; free (last_error->error); free (last_error); } @@ -87,16 +92,23 @@ free_errors_key (void *vp) static struct last_error * allocate_last_error_on_demand (void) { - struct last_error *last_error = pthread_getspecific (errors_key); + struct last_error *last_error = NULL; - if (!last_error) { - last_error = calloc (1, sizeof *last_error); - if (last_error) { - int err = pthread_setspecific (errors_key, last_error); - if (err != 0) { - /* This is not supposed to happen (XXX). */ - fprintf (stderr, "%s: %s: %s\n", "libnbd", "pthread_setspecific", - strerror (err)); + if (key_use_count) { + last_error = pthread_getspecific (errors_key); + if (!last_error) { + last_error = calloc (1, sizeof *last_error); + if (last_error) { + int err = pthread_setspecific (errors_key, last_error); + key_use_count++; + if (err != 0) { + /* This is not supposed to happen (XXX). */ + fprintf (stderr, "%s: %s: %s\n", "libnbd", "pthread_setspecific", + strerror (err)); + free (last_error); + last_error = NULL; + key_use_count--; + } } } } @@ -146,8 +158,10 @@ nbd_internal_get_error_context (void) const char * nbd_get_error (void) { - struct last_error *last_error = pthread_getspecific (errors_key); + struct last_error *last_error = NULL; + if (key_use_count) + last_error = pthread_getspecific (errors_key); if (!last_error) return NULL; return last_error->error; @@ -156,8 +170,10 @@ nbd_get_error (void) int nbd_get_errno (void) { - struct last_error *last_error = pthread_getspecific (errors_key); + struct last_error *last_error = NULL; + if (key_use_count) + last_error = pthread_getspecific (errors_key); if (!last_error) return 0; return last_error->errnum; -- 2.39.2 _______________________________________________ Libguestfs mailing list Libguestfs@redhat.com https://listman.redhat.com/mailman/listinfo/libguestfs