The deadlock was found using the following backtrace:

Thread 5:
0  __lll_lock_wait () at ../sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:135
1  0x00007efc6b650023 in __GI___pthread_mutex_lock 
(mutex=mutex@entry=0x55fc37128400 <worker_lock>) at 
../nptl/pthread_mutex_lock.c:78
2  0x000055fc36efbe05 in run_as_destroy_worker () at runas.c:1233
3  0x000055fc36efc2e7 in run_as_restart_worker (worker=<optimized out>) at 
runas.c:998
4  run_as (cmd=cmd@entry=RUN_AS_UNLINK, data=data@entry=0x7efc5b7fa630, 
ret_value=ret_value@entry=0x7efc5b7fa510, uid=uid@entry=1000, 
gid=gid@entry=1000) at runas.c:1033
5  0x000055fc36efc9ce in run_as_unlink (path=path@entry=0x7efc5b7fb690 
"/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index/channel0_3.idx",
 uid=uid@entry=1000, gid=gid@entry=1000) at runas.c:1120
6  0x000055fc36ef7feb in utils_unlink_stream_file 
(path_name=path_name@entry=0x7efc5b7fc7e0 
"/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel/index",
 file_name=file_name@entry=0x7efc500085d4 "channel0_3", size=size@entry=0, 
count=count@entry=0, uid=uid@entry=1000, gid=gid@entry=1000, 
suffix=0x55fc36f19b26 ".idx") at utils.c:929
7  0x000055fc36f01d4e in lttng_index_file_create 
(path_name=path_name@entry=0x7efc500087a0 
"/home/joraj/lttng-traces/auto-20190116-111518/20190116T111729-0500-33/kernel", 
stream_name=stream_name@entry=0x7efc500085d4 "channel0_3", uid=1000, gid=1000, 
size=0, count=0, major=1, minor=1) at index.c:79
8  0x000055fc36ed9475 in rotate_local_stream (ctx=<optimized out>, 
stream=0x7efc50008460) at consumer.c:4105
9  0x000055fc36ed98b5 in lttng_consumer_rotate_stream 
(ctx=ctx@entry=0x55fc37428d80, stream=stream@entry=0x7efc50008460, 
rotated=rotated@entry=0x7efc5b7fdb27) at consumer.c:4181
10 0x000055fc36ee354e in lttng_kconsumer_read_subbuffer 
(stream=stream@entry=0x7efc50008460, ctx=ctx@entry=0x55fc37428d80, 
rotated=rotated@entry=0x7efc5b7fdb27) at kernel-consumer.c:1740
11 0x000055fc36ed7a30 in lttng_consumer_read_subbuffer (stream=0x7efc50008460, 
ctx=0x55fc37428d80) at consumer.c:3383
12 0x000055fc36ed4b74 in consumer_thread_data_poll (data=0x55fc37428d80) at 
consumer.c:2751
13 0x00007efc6b64d6db in start_thread (arg=0x7efc5b7fe700) at 
pthread_create.c:463
14 0x00007efc6af6488f in clone () at 
../sysdeps/unix/sysv/linux/x86_64/clone.S:95

The owner of the lock is the blocked thread itself:
  print worker_lock.__data.__owner
    $2 = 25725
  thread find 25725
    Thread 5 has target id 'Thread 0x7efc5b7fe700 (LWP 25725)'

The worker_lock is first taken in frame #4 (run_as, runas.c:1033), and is then
taken a second time in frame #2 (run_as_destroy_worker, runas.c:1233) while
still held; since the mutex is not recursive, the thread deadlocks on itself.

  pthread_mutex_lock(&worker_lock);
  if (use_clone()) {
...
    /*
     * If the worker thread crashed the errno is set to EIO. We log
     * the error and start a new worker process.
     */
    if (ret == -1 && saved_errno == EIO) {
        DBG("Socket closed unexpectedly... "
        "Restarting the worker process");
->      ret = run_as_restart_worker(global_worker);
        if (ret == -1) {
          ERR("Failed to restart worker process.");
          goto err;
        }

Solution
========

Create run_as_destroy_worker_no_lock, which does not take the worker lock on
execution.
Use run_as_destroy_worker_no_lock inside run_as_restart_worker, which is
reached from the run_as error path with the worker lock already held.
Have run_as_destroy_worker take the worker lock and delegate to
run_as_destroy_worker_no_lock, preserving identical behaviour at the other
call sites.

Signed-off-by: Jonathan Rajotte <jonathan.rajotte-jul...@efficios.com>
---
 src/common/runas.c | 89 ++++++++++++++++++++++++----------------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/src/common/runas.c b/src/common/runas.c
index d727cf417..9e6277078 100644
--- a/src/common/runas.c
+++ b/src/common/runas.c
@@ -986,6 +986,51 @@ error_procname_alloc:
        return ret;
 }
 
+static
+void run_as_destroy_worker_no_lock(void)
+{
+       struct run_as_worker *worker = global_worker;
+
+       DBG("Destroying run_as worker");
+       if (!worker) {
+               return;
+       }
+       /* Close unix socket */
+       DBG("Closing run_as worker socket");
+       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
+               PERROR("close");
+       }
+       worker->sockpair[0] = -1;
+       /* Wait for worker. */
+       for (;;) {
+               int status;
+               pid_t wait_ret;
+
+               wait_ret = waitpid(worker->pid, &status, 0);
+               if (wait_ret < 0) {
+                       if (errno == EINTR) {
+                               continue;
+                       }
+                       PERROR("waitpid");
+                       break;
+               }
+
+               if (WIFEXITED(status)) {
+                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
+                                       DEFAULT_RUN_AS_WORKER_NAME " terminated with status code %d",
+                                       WEXITSTATUS(status));
+                       break;
+               } else if (WIFSIGNALED(status)) {
+                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal 
%d",
+                                       WTERMSIG(status));
+                       break;
+               }
+       }
+       free(worker->procname);
+       free(worker);
+       global_worker = NULL;
+}
+
 static
 int run_as_restart_worker(struct run_as_worker *worker)
 {
@@ -995,7 +1040,7 @@ int run_as_restart_worker(struct run_as_worker *worker)
        procname = worker->procname;
 
        /* Close socket to run_as worker process and clean up the zombie process */
-       run_as_destroy_worker();
+       run_as_destroy_worker_no_lock();
 
        /* Create a new run_as worker process*/
        ret = run_as_create_worker_no_lock(procname);
@@ -1227,47 +1272,7 @@ int run_as_create_worker(const char *procname)
 LTTNG_HIDDEN
 void run_as_destroy_worker(void)
 {
-       struct run_as_worker *worker = global_worker;
-
-       DBG("Destroying run_as worker");
        pthread_mutex_lock(&worker_lock);
-       if (!worker) {
-               goto end;
-       }
-       /* Close unix socket */
-       DBG("Closing run_as worker socket");
-       if (lttcomm_close_unix_sock(worker->sockpair[0])) {
-               PERROR("close");
-       }
-       worker->sockpair[0] = -1;
-       /* Wait for worker. */
-       for (;;) {
-               int status;
-               pid_t wait_ret;
-
-               wait_ret = waitpid(worker->pid, &status, 0);
-               if (wait_ret < 0) {
-                       if (errno == EINTR) {
-                               continue;
-                       }
-                       PERROR("waitpid");
-                       break;
-               }
-
-               if (WIFEXITED(status)) {
-                       LOG(WEXITSTATUS(status) == 0 ? PRINT_DBG : PRINT_ERR,
-                                       DEFAULT_RUN_AS_WORKER_NAME " terminated 
with status code %d",
-                                       WEXITSTATUS(status));
-                       break;
-               } else if (WIFSIGNALED(status)) {
-                       ERR(DEFAULT_RUN_AS_WORKER_NAME " was killed by signal 
%d",
-                                       WTERMSIG(status));
-                       break;
-               }
-       }
-       free(worker->procname);
-       free(worker);
-       global_worker = NULL;
-end:
+       run_as_destroy_worker_no_lock();
        pthread_mutex_unlock(&worker_lock);
 }
-- 
2.17.1

_______________________________________________
lttng-dev mailing list
lttng-dev@lists.lttng.org
https://lists.lttng.org/cgi-bin/mailman/listinfo/lttng-dev

Reply via email to