Jakub, this is the inconsistency you pointed out while we were analyzing the scheduling circular lists.

The problem in gomp_task_run_pre() is that, upon setting an upcoming running task to TIED, we may leave either the sibling or taskgroup queues in an indeterminate state. We may have upcoming WAITING tasks which means we may end up with a queue that looks like:

                            child_task
                                |
                                |
                                V
        WAITING -> WAITING -> TIED -> WAITING

Where TIED was the WAITING child_task upon entry, but will become TIED. Having a WAITING task following a TIED task violates our assumptions for the sibling and taskgroup queues.

How this all worked is beyond me, since all the various scheduling points in task.c are picking from the top of their respective queues. What happens if any of the scheduling points come to a TIED task? Who will pick up the remaining WAITING tasks that are incorrectly living past the TIED task?

However magically this happened before :), my attached changes to gomp_task_run_pre() will move this task to the end of the queue such that the WAITING tasks are first.

To find possible culprits I wrote various verification routines, which I think will be of use going forward. They are guarded by _LIBGOMP_CHECKING_ which will be optimized away if set to 0.

I have tested this patch on x86-64 Linux with _LIBGOMP_CHECKING_ of 1 with and without my rewiring changes, for OMP_NUM_THREADS of 1,2,4,5,16,56 (56 being my testing box's maximum number of threads). The verification routines found an actual failure in the taskgroup queues which this patch definitely fixes.

I also found that GOMP_taskgroup_end() passes a taskgroup to gomp_task_run_pre() that may or may not be relevant to the child_task at hand, because GOMP_taskgroup_end() may be picking the next task from either the sibling queue or the taskgroup queue. If the task is chosen from the sibling queue, it's because we have no WAITING tasks in the taskgroup, so IMO we shouldn't be passing around a taskgroup that doesn't match. This is also fixed.

Many of this will become clearer/simpler with my upcoming API changes. When priorities are implemented, I will adjust these verification routines to work with the circular lists *and* with the AVL/whatever trees.

OK for branch?
commit bb528e49ee4168bc682c5c76f961c8a41c30a04d
Author: Aldy Hernandez <al...@redhat.com>
Date:   Sun Oct 4 09:52:49 2015 -0700

        * libgomp.h (_LIBGOMP_CHECKING_): New macro.
        * task.c (verify_children_queue): New.
        (verify_taskgroup_queue): New.
        (verify_task_queue): New.
        (gomp_task_run_pre): Call verify_*_queue functions.
        If an upcoming tied task is about to leave the sibling or
        taskgroup queues in an invalid state, adjust appropriately.
        (GOMP_taskgroup_end): Do not pass taskgroup to gomp_task_run_pre().

diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 70b4e9f..a074ae7 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -36,6 +36,11 @@
 #ifndef LIBGOMP_H 
 #define LIBGOMP_H 1
 
+#ifndef _LIBGOMP_CHECKING_
+/* Define to 1 to perform internal sanity checks.  */
+#define _LIBGOMP_CHECKING_ 0
+#endif
+
 #include "config.h"
 #include "gstdint.h"
 #include "libgomp-plugin.h"
diff --git a/libgomp/task.c b/libgomp/task.c
index f6a67eb..6ac910e 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -27,6 +27,7 @@
    creation and termination.  */
 
 #include "libgomp.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include "gomp-constants.h"
@@ -567,10 +568,151 @@ gomp_create_target_task (struct gomp_device_descr 
*devicep,
     gomp_team_barrier_wake (&team->barrier, 1);
 }
 
+/* Sanity check TASK to make sure it is in its parent's children
+   queue, and that the tasks therein are in the right order.
+
+   The expected order is:
+       parent_depends_on WAITING tasks
+       !parent_depends_on WAITING tasks
+       TIED tasks
+
+   PARENT is the alleged parent of TASK.  */
+
+static void
+verify_children_queue (struct gomp_task *task, struct gomp_task *parent)
+{
+  if (task->parent != parent)
+    {
+      fprintf (stderr, "verify_children_queue: incompatible parents\n");
+      abort ();
+    }
+  /* It's OK, Annie was an orphan and she turned out all right.  */
+  if (!parent)
+    return;
+
+  bool seen_tied = false;
+  bool seen_plain_waiting = false;
+  bool found = false;
+  struct gomp_task *t = parent->children;
+  while (1)
+    {
+      if (t == task)
+       found = true;
+      if (seen_tied && t->kind == GOMP_TASK_WAITING)
+       {
+         fprintf (stderr,
+                  "verify_children_queue: WAITING task after TIED.");
+         abort ();
+       }
+      if (t->kind == GOMP_TASK_TIED)
+       seen_tied = true;
+      else if (t->kind == GOMP_TASK_WAITING)
+       {
+         if (t->parent_depends_on)
+           {
+             if (seen_plain_waiting)
+               {
+                 fprintf (stderr,
+                          "verify_children_queue: parent_depends_on after "
+                          "!parent_depends_on\n");
+                 abort ();
+               }
+           }
+         else
+           seen_plain_waiting = true;
+       }
+      t = t->next_child;
+      if (t == parent->children)
+       break;
+    }
+  if (!found)
+    {
+      fprintf (stderr,
+              "verify_children_queue: child not found in parent queue\n");
+      abort ();
+    }
+}
+
+/* Sanity check TASK to make sure it is in its taskgroup queue (if
+   applicable), and that the tasks therein are in the right order.
+
+   The expected order is that GOMP_TASK_WAITING tasks must come before
+   GOMP_TASK_TIED tasks.
+
+   TASK is the task.  TASKGROUP is the alleged taskgroup that contains
+   TASK.  */
+
+static void
+verify_taskgroup_queue (struct gomp_task *task,
+                       struct gomp_taskgroup *taskgroup)
+{
+  if (taskgroup != task->taskgroup)
+    {
+      fprintf (stderr, "verify_taskgroup_queue: incompatible taskgroups\n");
+      fprintf (stderr, "%p %p\n", task->taskgroup, taskgroup);
+      abort ();
+    }
+  if (!taskgroup)
+    return;
+
+  bool seen_tied = false;
+  bool found = false;
+  struct gomp_task *t = taskgroup->children;
+  while (1)
+    {
+      if (t == task)
+       found = true;
+      if (t->kind == GOMP_TASK_WAITING && seen_tied)
+       {
+         fprintf (stderr,
+                  "verify_taskgroup_queue: WAITING task after TIED.\n");
+         abort ();
+       }
+      if (t->kind == GOMP_TASK_TIED)
+       seen_tied = true;
+      t = t->next_taskgroup;
+      if (t == taskgroup->children)
+       break;
+    }
+  if (!found)
+    {
+      fprintf (stderr,
+              "verify_taskgroup_queue: child not found in parent queue\n");
+      abort ();
+    }
+}
+
+/* Verify that TASK is in the team's task queue.  */
+
+static void
+verify_task_queue (struct gomp_task *task, struct gomp_team *team)
+{
+  struct gomp_task *t = team->task_queue;
+  if (team)
+    while (1)
+      {
+       if (t == task)
+         return;
+       t = t->next_queue;
+       if (t == team->task_queue)
+         break;
+      }
+  fprintf (stderr, "verify_team_queue: child not in team\n");
+  abort ();
+}
+
 static inline bool
 gomp_task_run_pre (struct gomp_task *child_task, struct gomp_task *parent,
                   struct gomp_taskgroup *taskgroup, struct gomp_team *team)
 {
+  if (_LIBGOMP_CHECKING_)
+    {
+      verify_children_queue (child_task, parent);
+      verify_taskgroup_queue (child_task,
+                             taskgroup ? taskgroup : child_task->taskgroup);
+      verify_task_queue (child_task, team);
+    }
+
   if (parent)
     {
       /* Adjust children such that it will point to a next child,
@@ -583,6 +725,21 @@ gomp_task_run_pre (struct gomp_task *child_task, struct 
gomp_task *parent,
         by GOMP_taskwait).  */
       if (parent->children == child_task)
        parent->children = child_task->next_child;
+      /* TIED tasks cannot come before WAITING tasks.  If we're about
+        to make this task TIED, rewire things appropriately.
+        However, a TIED task at the end is perfectly fine.  */
+      else if (child_task->next_child->kind == GOMP_TASK_WAITING
+              && child_task->next_child != parent->children)
+       {
+         /* Remove from the list.  */
+         child_task->prev_child->next_child = child_task->next_child;
+         child_task->next_child->prev_child = child_task->prev_child;
+         /* Rewire at the end of its siblings.  */
+         child_task->next_child = parent->children;
+         child_task->prev_child = parent->children->prev_child;
+         parent->children->prev_child->next_child = child_task;
+         parent->children->prev_child = child_task;
+       }
 
       /* If the current task (child_task) is at the top of the
         parent's last_parent_depends_on, it's about to be removed
@@ -610,8 +767,28 @@ gomp_task_run_pre (struct gomp_task *child_task, struct 
gomp_task *parent,
   /* Adjust taskgroup to point to the next taskgroup.  See note above
      regarding adjustment of children as to why the child_task is not
      removed entirely from the circular list.  */
-  if (taskgroup && taskgroup->children == child_task)
-    taskgroup->children = child_task->next_taskgroup;
+  if (taskgroup)
+    {
+      if (taskgroup->children == child_task)
+       taskgroup->children = child_task->next_taskgroup;
+      /* TIED tasks cannot come before WAITING tasks.  If we're about
+        to make this task TIED, rewire things appropriately.
+        However, a TIED task at the end is perfectly fine.  */
+      else if (child_task->next_taskgroup->kind == GOMP_TASK_WAITING
+              && child_task->next_taskgroup != taskgroup->children)
+       {
+         /* Remove from the list.  */
+         child_task->prev_taskgroup->next_taskgroup
+           = child_task->next_taskgroup;
+         child_task->next_taskgroup->prev_taskgroup
+           = child_task->prev_taskgroup;
+         /* Rewire at the end of its taskgroup.  */
+         child_task->next_taskgroup = taskgroup->children;
+         child_task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+         taskgroup->children->prev_taskgroup->next_taskgroup = child_task;
+         taskgroup->children->prev_taskgroup = child_task;
+       }
+    }
 
   /* Remove child_task from the task_queue.  */
   child_task->prev_queue->next_queue = child_task->next_queue;
@@ -1339,6 +1516,7 @@ GOMP_taskgroup_end (void)
     goto finish;
 
   gomp_mutex_lock (&team->task_lock);
+  bool task_from_taskgroup = true;
   while (1)
     {
       bool cancelled = false;
@@ -1349,6 +1527,7 @@ GOMP_taskgroup_end (void)
              if (task->children == NULL)
                goto do_wait;
              child_task = task->children;
+             task_from_taskgroup = false;
             }
           else
            {
@@ -1362,11 +1541,20 @@ GOMP_taskgroup_end (void)
            }
        }
       else
-       child_task = taskgroup->children;
+       {
+         child_task = taskgroup->children;
+         task_from_taskgroup = true;
+       }
       if (child_task->kind == GOMP_TASK_WAITING)
        {
+         /* ?? Since child_task can come from either
+            taskgroup->children or from task->children, child_task
+            does not necessarily live in `taskgroup' below.  Perhaps
+            in this case we should pass NULL to taskgroup, to
+            indicate child_task does not live in `taskgroup'?  */
          cancelled
-           = gomp_task_run_pre (child_task, child_task->parent, taskgroup,
+           = gomp_task_run_pre (child_task, child_task->parent,
+                                task_from_taskgroup ? taskgroup : NULL,
                                 team);
          if (__builtin_expect (cancelled, 0))
            {

Reply via email to