On Fri, Sep 27, 2013 at 01:48:36AM +0200, Jakub Jelinek wrote:
> Perhaps.  What if I do just minor cleanup (use flexible array members for
> the reallocated vectors, and perhaps keep only the last out/inout task
> in the hash table chains rather than all of them), retest, commit and then
> we can discuss/incrementally improve it?
Here is what I've committed now.  The incremental changes were really only
using a structure with a flexible array member for the dependers vectors,
removing (well, marking redundant) an earlier !ent->is_in entry when adding
a !is_in entry into the chain, and the addition of new testcases.  Let's
improve it incrementally later.

2013-09-27  Jakub Jelinek  <ja...@redhat.com>

	* libgomp.h: Include stdlib.h.
	(struct gomp_task_depend_entry, struct gomp_dependers_vec): New
	types.
	(struct gomp_task): Add dependers, depend_hash, depend_count,
	num_dependees and depend fields.
	(struct gomp_taskgroup): Add num_children field.
	(gomp_finish_task): Free depend_hash if non-NULL.
	* libgomp_g.h (GOMP_task): Add depend argument.
	* hashtab.h: New file.
	* task.c: Include hashtab.h.
	(hash_entry_type): New typedef.
	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
	(gomp_init_task): Clear dependers, depend_hash and depend_count
	fields.
	(GOMP_task): Add depend argument, handle depend clauses.  Increment
	num_children field in taskgroup.
	(gomp_task_run_pre): Don't increment task_running_count here, nor
	clear task_pending bit.
	(gomp_task_run_post_handle_depend_hash,
	gomp_task_run_post_handle_dependers,
	gomp_task_run_post_handle_depend): New functions.
	(gomp_task_run_post_remove_parent): Clear in_taskwait before
	signalling corresponding semaphore.
	(gomp_task_run_post_remove_taskgroup): Decrement num_children
	field and make the decrement to 0 a MEMMODEL_RELEASE operation,
	rather than storing NULL to taskgroup->children.  Clear
	in_taskgroup_wait before signalling corresponding semaphore.
	(gomp_barrier_handle_tasks): Move task_running_count increment
	and task_pending bit clearing here.  Call
	gomp_task_run_post_handle_depend.  If more than one new task has
	been queued, wake other threads if needed.
	(GOMP_taskwait): Call gomp_task_run_post_handle_depend.  If more
	than one new task has been queued, wake other threads if needed.
	After waiting on taskwait_sem, enter critical section again.
	(GOMP_taskgroup_start): Initialize num_children field.
	(GOMP_taskgroup_end): Check num_children instead of children
	before critical section.  If children is NULL, but num_children
	is non-zero, wait on taskgroup_sem.  Call
	gomp_task_run_post_handle_depend.  If more than one new task has
	been queued, wake other threads if needed.  After waiting on
	taskgroup_sem, enter critical section again.
	* testsuite/libgomp.c/depend-1.c: New test.
	* testsuite/libgomp.c/depend-2.c: New test.
	* testsuite/libgomp.c/depend-3.c: New test.
	* testsuite/libgomp.c/depend-4.c: New test.
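For reference on the new interface: the extra depend argument of GOMP_task
is a flat array describing the task's depend clauses - depend[0] holds the
number of depend clauses, depend[1] the number of out/inout clauses (those
are laid out first), and the depended-on addresses follow from depend[2]
onwards; bit 3 (the 8 value) in the flags argument says whether the array
is present at all.  So for a task with depend(inout: x) depend(in: y, z)
the compiler emits a call roughly like (fn, data, cpyfn, arg_size and
arg_align standing for the usual argument passing):

  void *depend[] = { (void *) 3, (void *) 1, &x, &y, &z };
  GOMP_task (fn, data, cpyfn, arg_size, arg_align, true, 8, depend);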
--- libgomp/libgomp.h.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/libgomp.h	2013-09-27 09:05:17.025402127 +0200
@@ -39,6 +39,7 @@
 
 #include <pthread.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
@@ -253,7 +254,26 @@ enum gomp_task_kind
   GOMP_TASK_TIED
 };
 
+struct gomp_task;
 struct gomp_taskgroup;
+struct htab;
+
+struct gomp_task_depend_entry
+{
+  void *addr;
+  struct gomp_task_depend_entry *next;
+  struct gomp_task_depend_entry *prev;
+  struct gomp_task *task;
+  bool is_in;
+  bool redundant;
+};
+
+struct gomp_dependers_vec
+{
+  size_t n_elem;
+  size_t allocated;
+  struct gomp_task *elem[];
+};
 
 /* This structure describes a "task" to be run by a thread.  */
@@ -268,6 +288,10 @@ struct gomp_task
   struct gomp_task *next_taskgroup;
   struct gomp_task *prev_taskgroup;
   struct gomp_taskgroup *taskgroup;
+  struct gomp_dependers_vec *dependers;
+  struct htab *depend_hash;
+  size_t depend_count;
+  size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
@@ -277,6 +301,7 @@ struct gomp_task
   bool final_task;
   bool copy_ctors_done;
   gomp_sem_t taskwait_sem;
+  struct gomp_task_depend_entry depend[];
 };
 
 struct gomp_taskgroup
@@ -286,6 +311,7 @@ struct gomp_taskgroup
   bool in_taskgroup_wait;
   bool cancelled;
   gomp_sem_t taskgroup_sem;
+  size_t num_children;
 };
 
 /* This structure describes a "team" of threads.  These are the threads
@@ -525,6 +551,8 @@ extern void gomp_barrier_handle_tasks (g
 static void inline
 gomp_finish_task (struct gomp_task *task)
 {
+  if (__builtin_expect (task->depend_hash != NULL, 0))
+    free (task->depend_hash);
   gomp_sem_destroy (&task->taskwait_sem);
 }
 
--- libgomp/libgomp_g.h.jj	2013-09-26 09:43:10.902930838 +0200
+++ libgomp/libgomp_g.h	2013-09-26 10:08:44.820160094 +0200
@@ -178,7 +178,7 @@ extern bool GOMP_cancellation_point (int
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned);
+		       long, long, bool, unsigned, void **);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
 extern void GOMP_taskgroup_start (void);
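Note the trailing depend[] flexible array in struct gomp_task: the depend
entries are carved out of the same allocation as the task itself and its
argument block - see the gomp_malloc in GOMP_task below, which allocates
roughly sizeof (*task) + ndepend * sizeof (struct gomp_task_depend_entry)
+ arg_size + arg_align - 1 in one go - so gomp_finish_task only has to
free the separately allocated depend_hash.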
--- libgomp/hashtab.h.jj	2013-09-26 10:08:51.031128932 +0200
+++ libgomp/hashtab.h	2013-09-26 21:07:17.757697391 +0200
@@ -0,0 +1,443 @@
+/* An expandable hash table datatype.
+   Copyright (C) 1999-2013
+   Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov <vmaka...@cygnus.com>.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* The hash table code copied from include/hashtab.[hc] and adjusted,
+   so that the hash table entries are in the flexible array at the end
+   of the control structure, no callbacks are used and the elements in the
+   table are of the hash_entry_type type.
+   Before including this file, define hash_entry_type type and
+   htab_alloc and htab_free functions.  After including it, define
+   htab_hash and htab_eq inline functions.  */
+
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The art of computer programming".  The hash table is
+   expanded by creation of a new hash table and transferring elements from
+   the old table to the new table.  */
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+static inline hashval_t htab_hash (hash_entry_type);
+static inline bool htab_eq (hash_entry_type, hash_entry_type);
+
+/* This macro defines reserved value for empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
+
+/* This macro defines reserved value for table entry which contained
+   a deleted element.  */
+
+#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with hash table should be executed only through
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+
+  /* Table itself.  */
+  hash_entry_type entries[];
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* Table of primes and multiplicative inverses.
+
+   Note that these are not minimally reduced inverses.  Unlike when generating
+   code to divide by a constant, we want to be able to use the same algorithm
+   all the time.  All of these inverses (are implied to) have bit 32 set.
+
+   For the record, the function that computed the table is in
+   libiberty/hashtab.c.  */
+
+struct prime_ent
+{
+  hashval_t prime;
+  hashval_t inv;
+  hashval_t inv_m2;	/* inverse of prime-2 */
+  hashval_t shift;
+};
+
+static struct prime_ent const prime_tab[] = {
+  { 7, 0x24924925, 0x9999999b, 2 },
+  { 13, 0x3b13b13c, 0x745d1747, 3 },
+  { 31, 0x08421085, 0x1a7b9612, 4 },
+  { 61, 0x0c9714fc, 0x15b1e5f8, 5 },
+  { 127, 0x02040811, 0x0624dd30, 6 },
+  { 251, 0x05197f7e, 0x073260a5, 7 },
+  { 509, 0x01824366, 0x02864fc8, 8 },
+  { 1021, 0x00c0906d, 0x014191f7, 9 },
+  { 2039, 0x0121456f, 0x0161e69e, 10 },
+  { 4093, 0x00300902, 0x00501908, 11 },
+  { 8191, 0x00080041, 0x00180241, 12 },
+  { 16381, 0x000c0091, 0x00140191, 13 },
+  { 32749, 0x002605a5, 0x002a06e6, 14 },
+  { 65521, 0x000f00e2, 0x00110122, 15 },
+  { 131071, 0x00008001, 0x00018003, 16 },
+  { 262139, 0x00014002, 0x0001c004, 17 },
+  { 524287, 0x00002001, 0x00006001, 18 },
+  { 1048573, 0x00003001, 0x00005001, 19 },
+  { 2097143, 0x00004801, 0x00005801, 20 },
+  { 4194301, 0x00000c01, 0x00001401, 21 },
+  { 8388593, 0x00001e01, 0x00002201, 22 },
+  { 16777213, 0x00000301, 0x00000501, 23 },
+  { 33554393, 0x00001381, 0x00001481, 24 },
+  { 67108859, 0x00000141, 0x000001c1, 25 },
+  { 134217689, 0x000004e1, 0x00000521, 26 },
+  { 268435399, 0x00000391, 0x000003b1, 27 },
+  { 536870909, 0x00000019, 0x00000029, 28 },
+  { 1073741789, 0x0000008d, 0x00000095, 29 },
+  { 2147483647, 0x00000003, 0x00000007, 30 },
+  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
+  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
+};
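To illustrate the inverse trick on the first entry (prime 7, inv
0x24924925, shift 2), take x = 10: t1 = ((unsigned long long) 10 *
0x24924925) >> 32 is 1, t2 = 10 - 1 = 9, t3 = 9 >> 1 = 4, t4 = 1 + 4 = 5,
q = 5 >> 2 = 1 and r = 10 - 1 * 7 = 3, which indeed equals 10 % 7; see
htab_mod_1 below for the exact sequence of operations.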
+/* The following function returns an index into the above table of the
+   nearest prime number which is greater than N, and near a power of two.  */
+
+static unsigned int
+higher_prime_index (unsigned long n)
+{
+  unsigned int low = 0;
+  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
+
+  while (low != high)
+    {
+      unsigned int mid = low + (high - low) / 2;
+      if (n > prime_tab[mid].prime)
+	low = mid + 1;
+      else
+	high = mid;
+    }
+
+  /* If we've run out of primes, abort.  */
+  if (n > prime_tab[low].prime)
+    abort ();
+
+  return low;
+}
+
+/* Return the current size of given hash table.  */
+
+static inline size_t
+htab_size (htab_t htab)
+{
+  return htab->size;
+}
+
+/* Return the current number of elements in given hash table.  */
+
+static inline size_t
+htab_elements (htab_t htab)
+{
+  return htab->n_elements - htab->n_deleted;
+}
+
+/* Return X % Y.  */
+
+static inline hashval_t
+htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
+{
+  /* The multiplicative inverses computed above are for 32-bit types, and
+     require that we be able to compute a highpart multiply.  */
+  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
+    {
+      hashval_t t1, t2, t3, t4, q, r;
+
+      t1 = ((unsigned long long)x * inv) >> 32;
+      t2 = x - t1;
+      t3 = t2 >> 1;
+      t4 = t1 + t3;
+      q  = t4 >> shift;
+      r  = x - (q * y);
+
+      return r;
+    }
+
+  /* Otherwise just use the native division routines.  */
+  return x % y;
+}
+
+/* Compute the primary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
+}
+
+/* Compute the secondary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod_m2 (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
+}
+
+/* Create hash table of size SIZE.  */
+
+static htab_t
+htab_create (size_t size)
+{
+  htab_t result;
+  unsigned int size_prime_index;
+
+  size_prime_index = higher_prime_index (size);
+  size = prime_tab[size_prime_index].prime;
+
+  result = (htab_t) htab_alloc (sizeof (struct htab)
+				+ size * sizeof (hash_entry_type));
+  result->size = size;
+  result->n_elements = 0;
+  result->n_deleted = 0;
+  result->size_prime_index = size_prime_index;
+  memset (result->entries, 0, size * sizeof (hash_entry_type));
+  return result;
+}
+
+/* Similar to htab_find_slot, but without several unwanted side effects:
+   - Does not call htab_eq when it finds an existing entry.
+   - Does not change the count of elements in the hash table.
+   This function also assumes there are no deleted entries in the table.
+   HASH is the hash value for the element to be inserted.  */
+
+static hash_entry_type *
+find_empty_slot_for_expand (htab_t htab, hashval_t hash)
+{
+  hashval_t index = htab_mod (hash, htab);
+  size_t size = htab_size (htab);
+  hash_entry_type *slot = htab->entries + index;
+  hashval_t hash2;
+
+  if (*slot == HTAB_EMPTY_ENTRY)
+    return slot;
+  else if (*slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      slot = htab->entries + index;
+      if (*slot == HTAB_EMPTY_ENTRY)
+	return slot;
+      else if (*slot == HTAB_DELETED_ENTRY)
+	abort ();
+    }
+}
+/* The following function changes size of memory allocated for the
+   entries and repeatedly inserts the table elements.  The occupancy
+   of the table after the call will be about 50%.  Naturally the hash
+   table must already exist.  Remember also that the place of the
+   table entries is changed.  */
+
+static htab_t
+htab_expand (htab_t htab)
+{
+  htab_t nhtab;
+  hash_entry_type *olimit;
+  hash_entry_type *p;
+  size_t osize, elts;
+
+  osize = htab->size;
+  olimit = htab->entries + osize;
+  elts = htab_elements (htab);
+
+  /* Resize only when table after removal of unused elements is either
+     too full or too empty.  */
+  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+    nhtab = htab_create (elts * 2);
+  else
+    nhtab = htab_create (osize - 1);
+  nhtab->n_elements = htab->n_elements - htab->n_deleted;
+
+  p = htab->entries;
+  do
+    {
+      hash_entry_type x = *p;
+
+      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
+
+      p++;
+    }
+  while (p < olimit);
+
+  htab_free (htab);
+  return nhtab;
+}
+
+/* This function searches for a hash table entry equal to the given
+   element.  It cannot be used to insert or delete an element.  */
+
+static hash_entry_type
+htab_find (htab_t htab, const hash_entry_type element)
+{
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+
+  size = htab_size (htab);
+  index = htab_mod (hash, htab);
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY
+      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+    return entry;
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY
+	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+	return entry;
+    }
+}
+
+/* This function searches for a hash table slot containing an entry
+   equal to the given element.  To delete an entry, call this with
+   insert=NO_INSERT, then call htab_clear_slot on the slot returned
+   (possibly after doing some checks).  To insert an entry, call this
+   with insert=INSERT, then write the value you want into the returned
+   slot.  */
+
+static hash_entry_type *
+htab_find_slot (htab_t *htabp, const hash_entry_type element,
+		enum insert_option insert)
+{
+  hash_entry_type *first_deleted_slot;
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+  htab_t htab = *htabp;
+
+  size = htab_size (htab);
+  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
+    {
+      htab = *htabp = htab_expand (htab);
+      size = htab_size (htab);
+    }
+
+  index = htab_mod (hash, htab);
+
+  first_deleted_slot = NULL;
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY)
+    goto empty_entry;
+  else if (entry == HTAB_DELETED_ENTRY)
+    first_deleted_slot = &htab->entries[index];
+  else if (htab_eq (entry, element))
+    return &htab->entries[index];
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY)
+	goto empty_entry;
+      else if (entry == HTAB_DELETED_ENTRY)
+	{
+	  if (!first_deleted_slot)
+	    first_deleted_slot = &htab->entries[index];
+	}
+      else if (htab_eq (entry, element))
+	return &htab->entries[index];
+    }
+
+ empty_entry:
+  if (insert == NO_INSERT)
+    return NULL;
+
+  if (first_deleted_slot)
+    {
+      htab->n_deleted--;
+      *first_deleted_slot = HTAB_EMPTY_ENTRY;
+      return first_deleted_slot;
+    }
+
+  htab->n_elements++;
+  return &htab->entries[index];
+}
+/* This function clears a specified slot in a hash table.  It is
+   useful when you've already done the lookup and don't want to do it
+   again.  */
+
+static inline void
+htab_clear_slot (htab_t htab, hash_entry_type *slot)
+{
+  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
+      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  *slot = HTAB_DELETED_ENTRY;
+  htab->n_deleted++;
+}
+
+/* Returns a hash code for pointer P.  Simplified version of evahash.  */
+
+static inline hashval_t
+hash_pointer (const void *p)
+{
+  uintptr_t v = (uintptr_t) p;
+  if (sizeof (v) > sizeof (hashval_t))
+    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
+  return v;
+}
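hashtab.h is thus used as a poor man's template: an includer first defines
hash_entry_type together with htab_alloc/htab_free, includes the header and
then supplies htab_hash/htab_eq for its entry type, along the lines of
(my_entry and its key field are made-up names just for illustration):

  typedef struct my_entry *hash_entry_type;
  static inline void *htab_alloc (size_t size) { return gomp_malloc (size); }
  static inline void htab_free (void *ptr) { free (ptr); }
  #include "hashtab.h"
  static inline hashval_t htab_hash (hash_entry_type e)
  { return hash_pointer (e->key); }
  static inline bool htab_eq (hash_entry_type x, hash_entry_type y)
  { return x->key == y->key; }

task.c below does exactly this for gomp_task_depend_entry keyed on addr.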
--- libgomp/task.c.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/task.c	2013-09-27 09:30:57.798187840 +0200
@@ -29,6 +29,33 @@
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct gomp_task_depend_entry *hash_entry_type;
+
+static inline void *
+htab_alloc (size_t size)
+{
+  return gomp_malloc (size);
+}
+
+static inline void
+htab_free (void *ptr)
+{
+  free (ptr);
+}
+
+#include "hashtab.h"
+
+static inline hashval_t
+htab_hash (hash_entry_type element)
+{
+  return hash_pointer (element->addr);
+}
+
+static inline bool
+htab_eq (hash_entry_type x, hash_entry_type y)
+{
+  return x->addr == y->addr;
+}
 
 /* Create a new task data structure.  */
 
@@ -45,6 +72,9 @@ gomp_init_task (struct gomp_task *task,
   task->copy_ctors_done = false;
   task->children = NULL;
   task->taskgroup = NULL;
+  task->dependers = NULL;
+  task->depend_hash = NULL;
+  task->depend_count = 0;
   gomp_sem_init (&task->taskwait_sem, 0);
 }
 
@@ -80,7 +110,8 @@ gomp_clear_parent (struct gomp_task *chi
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
+	   long arg_size, long arg_align, bool if_clause, unsigned flags,
+	   void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -108,6 +139,38 @@ GOMP_task (void (*fn) (void *), void *da
     {
       struct gomp_task task;
 
+      /* If there are depend clauses and earlier deferred sibling tasks
+	 with depend clauses, check if there isn't a dependency.  If there
+	 is, fall through to the deferred task handling, as we can't
+	 schedule such tasks right away.  There is no need to handle
+	 depend clauses for non-deferred tasks other than this, because
+	 the parent task is suspended until the child task finishes and thus
+	 it can't start further child tasks.  */
+      if ((flags & 8) && thr->task && thr->task->depend_hash)
+	{
+	  struct gomp_task *parent = thr->task;
+	  struct gomp_task_depend_entry elem, *ent = NULL;
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  gomp_mutex_lock (&team->task_lock);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      elem.addr = depend[i + 2];
+	      ent = htab_find (parent->depend_hash, &elem);
+	      for (; ent; ent = ent->next)
+		if (i >= nout && ent->is_in)
+		  continue;
+		else
+		  break;
+	      if (ent)
+		break;
+	    }
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (ent)
+	    goto defer;
+	}
+
       gomp_init_task (&task, thr->task, gomp_icv (false));
       task.kind = GOMP_TASK_IFFALSE;
       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
@@ -146,14 +209,20 @@ GOMP_task (void (*fn) (void *), void *da
     }
   else
     {
+    defer:;
       struct gomp_task *task;
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      bool do_wake;
+      size_t depend_size = 0;
 
-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+      if (flags & 8)
+	depend_size = ((uintptr_t) depend[0]
+		       * sizeof (struct gomp_task_depend_entry));
+      task = gomp_malloc (sizeof (*task) + depend_size
+			  + arg_size + arg_align - 1);
+      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
		      & ~(uintptr_t) (arg_align - 1));
      gomp_init_task (task, parent, gomp_icv (false));
      task->kind = GOMP_TASK_IFFALSE;
@@ -171,7 +240,6 @@ GOMP_task (void (*fn) (void *), void *da
      task->kind = GOMP_TASK_WAITING;
      task->fn = fn;
      task->fn_data = arg;
-      task->in_tied_task = true;
      task->final_task = (flags & 2) >> 1;
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
@@ -185,6 +253,117 @@ GOMP_task (void (*fn) (void *), void *da
	  free (task);
	  return;
	}
+      if (taskgroup)
+	taskgroup->num_children++;
+      if (depend_size)
+	{
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  hash_entry_type ent;
+
+	  task->depend_count = ndepend;
+	  task->num_dependees = 0;
+	  if (parent->depend_hash == NULL)
+	    parent->depend_hash
+	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      task->depend[i].addr = depend[2 + i];
+	      task->depend[i].next = NULL;
+	      task->depend[i].prev = NULL;
+	      task->depend[i].task = task;
+	      task->depend[i].is_in = i >= nout;
+	      task->depend[i].redundant = false;
+
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &task->depend[i],
+				  INSERT);
+	      hash_entry_type out = NULL;
+	      if (*slot)
+		{
+		  /* If multiple depends on the same task are the
+		     same, all but the first one are redundant.
+		     As inout/out come first, if any of them is
+		     inout/out, it will win, which is the right
+		     semantics.  */
+		  if ((*slot)->task == task)
+		    {
+		      task->depend[i].redundant = true;
+		      continue;
+		    }
+		  for (ent = *slot; ent; ent = ent->next)
+		    {
+		      /* depend(in:...) doesn't depend on earlier
+			 depend(in:...).  */
+		      if (i >= nout && ent->is_in)
+			continue;
+
+		      if (!ent->is_in)
+			out = ent;
+
+		      struct gomp_task *tsk = ent->task;
+		      if (tsk->dependers == NULL)
+			{
+			  tsk->dependers
+			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
+					   + 6 * sizeof (struct gomp_task *));
+			  tsk->dependers->n_elem = 1;
+			  tsk->dependers->allocated = 6;
+			  tsk->dependers->elem[0] = task;
+			  task->num_dependees++;
+			  continue;
+			}
+		      /* We already have some other dependency on tsk
+			 from earlier depend clause.  */
+		      else if (tsk->dependers->n_elem
+			       && (tsk->dependers->elem[tsk->dependers->n_elem
+							- 1]
+				   == task))
+			continue;
+		      else if (tsk->dependers->n_elem
+			       == tsk->dependers->allocated)
+			{
+			  tsk->dependers->allocated
+			    = tsk->dependers->allocated * 2 + 2;
+			  tsk->dependers
+			    = gomp_realloc (tsk->dependers,
+					    sizeof (struct gomp_dependers_vec)
+					    + (tsk->dependers->allocated
+					       * sizeof (struct gomp_task *)));
+			}
+		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+		      task->num_dependees++;
+		    }
+		  task->depend[i].next = *slot;
+		  (*slot)->prev = &task->depend[i];
+		}
+	      *slot = &task->depend[i];
+
+	      /* There is no need to store more than one depend({,in}out:)
+		 task per address in the hash table chain, because each out
+		 depends on all earlier outs, thus it is enough to record
+		 just the last depend({,in}out:).  For depend(in:), we need
+		 to keep all of the previous ones not terminated yet, because
+		 a later depend({,in}out:) might need to depend on all of
+		 them.  So, if the new task's clause is depend({,in}out:),
+		 we know there is at most one other depend({,in}out:) clause
+		 in the list (out) and to maintain the invariant we now
+		 need to remove it from the list.  */
+	      if (!task->depend[i].is_in && out)
+		{
+		  if (out->next)
+		    out->next->prev = out->prev;
+		  out->prev->next = out->next;
+		  out->redundant = true;
+		}
+	    }
+	  if (task->num_dependees)
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      return;
+	    }
+	}
       if (parent->children)
	{
	  task->next_child = parent->children;
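To make the invariant concrete, consider four sibling tasks created in this
order:

  #pragma omp task depend(out: x)	/* A */
  #pragma omp task depend(in: x)	/* B */
  #pragma omp task depend(in: x)	/* C */
  #pragma omp task depend(out: x)	/* D */

B and C each record a dependency only on A (in doesn't depend on in), while
D walks the chain C -> B -> A, records dependencies on all three (so its
num_dependees is 3) and, being an out itself, unlinks A - the only other
out in the chain - so the chain for &x afterwards is D -> C -> B.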
@@ -259,12 +438,133 @@ gomp_task_run_pre (struct gomp_task *chi
	  || (taskgroup && taskgroup->cancelled))
	 && !child_task->copy_ctors_done)
     return true;
-  team->task_running_count++;
-  if (team->task_count == team->task_running_count)
-    gomp_team_barrier_clear_task_pending (&team->barrier);
   return false;
 }
 
+static void
+gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i;
+
+  for (i = 0; i < child_task->depend_count; i++)
+    if (!child_task->depend[i].redundant)
+      {
+	if (child_task->depend[i].next)
+	  child_task->depend[i].next->prev = child_task->depend[i].prev;
+	if (child_task->depend[i].prev)
+	  child_task->depend[i].prev->next = child_task->depend[i].next;
+	else
+	  {
+	    hash_entry_type *slot
+	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				NO_INSERT);
+	    if (*slot != &child_task->depend[i])
+	      abort ();
+	    if (child_task->depend[i].next)
+	      *slot = child_task->depend[i].next;
+	    else
+	      htab_clear_slot (parent->depend_hash, slot);
+	  }
+      }
+}
+
+static size_t
+gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+				     struct gomp_team *team)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i, count = child_task->dependers->n_elem, ret = 0;
+  for (i = 0; i < count; i++)
+    {
+      struct gomp_task *task = child_task->dependers->elem[i];
+      if (--task->num_dependees != 0)
+	continue;
+
+      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      if (parent)
+	{
+	  if (parent->children)
+	    {
+	      task->next_child = parent->children;
+	      task->prev_child = parent->children->prev_child;
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	    }
+	  parent->children = task;
+	  if (parent->in_taskwait)
+	    {
+	      parent->in_taskwait = false;
+	      gomp_sem_post (&parent->taskwait_sem);
+	    }
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	  if (taskgroup->in_taskgroup_wait)
+	    {
+	      taskgroup->in_taskgroup_wait = false;
+	      gomp_sem_post (&taskgroup->taskgroup_sem);
+	    }
+	}
+      if (team->task_queue)
+	{
+	  task->next_queue = team->task_queue;
+	  task->prev_queue = team->task_queue->prev_queue;
+	  task->next_queue->prev_queue = task;
+	  task->prev_queue->next_queue = task;
+	}
+      else
+	{
+	  task->next_queue = task;
+	  task->prev_queue = task;
+	  team->task_queue = task;
+	}
+      ++team->task_count;
+      ++ret;
+    }
+  free (child_task->dependers);
+  child_task->dependers = NULL;
+  if (ret > 1)
+    gomp_team_barrier_set_task_pending (&team->barrier);
+  return ret;
+}
+
+static inline size_t
+gomp_task_run_post_handle_depend (struct gomp_task *child_task,
+				  struct gomp_team *team)
+{
+  if (child_task->depend_count == 0)
+    return 0;
+
+  /* If parent is gone already, the hash table is freed and nothing
+     will use the hash table anymore, no need to remove anything from it.  */
+  if (child_task->parent != NULL)
+    gomp_task_run_post_handle_depend_hash (child_task);
+
+  if (child_task->dependers == NULL)
+    return 0;
+
+  return gomp_task_run_post_handle_dependers (child_task, team);
+}
+
 static inline void
 gomp_task_run_post_remove_parent (struct gomp_task *child_task)
 {
@@ -286,7 +586,10 @@ gomp_task_run_post_remove_parent (struct
	 before the NULL is written.  */
       __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
       if (parent->in_taskwait)
-	gomp_sem_post (&parent->taskwait_sem);
+	{
+	  parent->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait_sem);
+	}
     }
 }
 
@@ -298,20 +601,29 @@ gomp_task_run_post_remove_taskgroup (str
     return;
   child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
   child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
-  if (taskgroup->children != child_task)
-    return;
-  if (child_task->next_taskgroup != child_task)
-    taskgroup->children = child_task->next_taskgroup;
+  if (taskgroup->num_children > 1)
+    --taskgroup->num_children;
   else
     {
-      /* We access task->children in GOMP_taskgroup_end
+      /* We access taskgroup->num_children in GOMP_taskgroup_end
	 outside of the task lock mutex region, so
	 need a release barrier here to ensure memory
	 written by child_task->fn above is flushed
	 before the NULL is written.  */
-      __atomic_store_n (&taskgroup->children, NULL, MEMMODEL_RELEASE);
+      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+    }
+  if (taskgroup->children != child_task)
+    return;
+  if (child_task->next_taskgroup != child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  else
+    {
+      taskgroup->children = NULL;
       if (taskgroup->in_taskgroup_wait)
-	gomp_sem_post (&taskgroup->taskgroup_sem);
+	{
+	  taskgroup->in_taskgroup_wait = false;
+	  gomp_sem_post (&taskgroup->taskgroup_sem);
+	}
     }
 }
 
@@ -323,6 +635,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   gomp_mutex_lock (&team->task_lock);
   if (gomp_barrier_last_thread (state))
@@ -355,8 +668,17 @@ gomp_barrier_handle_tasks (gomp_barrier_
		}
	      goto finish_cancelled;
	    }
+	  team->task_running_count++;
+	  child_task->in_tied_task = true;
+	  if (team->task_count == team->task_running_count)
+	    gomp_team_barrier_clear_task_pending (&team->barrier);
	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -374,7 +696,9 @@ gomp_barrier_handle_tasks (gomp_barrier_
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  gomp_task_run_post_remove_parent (child_task);
	  gomp_clear_parent (child_task->children);
	  gomp_task_run_post_remove_taskgroup (child_task);
@@ -382,6 +706,12 @@ gomp_barrier_handle_tasks (gomp_barrier_
	  child_task = NULL;
	  if (!cancelled)
	    team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	  if (--team->task_count == 0
	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
	    {
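The do_wake logic added to the three waiters is the same idea everywhere:
if completing a task made more than one dependent task runnable
(new_tasks > 1), wake additional threads, but never more than the number
of new tasks and never more than the threads not currently accounted as
running a task (team->nthreads - team->task_running_count; in
GOMP_taskwait/GOMP_taskgroup_end further reduced by one when the waiting
thread itself isn't counted there, i.e. when in_tied_task is false).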
@@ -404,9 +734,10 @@ GOMP_taskwait (void)
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
+     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
      not necessary that we synchronize with other non-NULL writes at
      this point, but we must ensure that all writes to memory by a
      child thread task work function are seen before we exit from
@@ -451,6 +782,11 @@ GOMP_taskwait (void)
	   in other threads.  Wait for them.  */
	task->in_taskwait = true;
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -464,15 +800,13 @@ GOMP_taskwait (void)
	  thr->task = task;
	}
       else
-	{
-	  gomp_sem_wait (&task->taskwait_sem);
-	  task->in_taskwait = false;
-	  return;
-	}
+	gomp_sem_wait (&task->taskwait_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  child_task->prev_child->next_child = child_task->next_child;
	  child_task->next_child->prev_child = child_task->prev_child;
	  if (task->children == child_task)
@@ -487,7 +821,13 @@ GOMP_taskwait (void)
	  to_free = child_task;
	  child_task = NULL;
	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	}
     }
 }
@@ -519,6 +859,7 @@ GOMP_taskgroup_start (void)
   taskgroup->children = NULL;
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
+  taskgroup->num_children = 0;
   gomp_sem_init (&taskgroup->taskgroup_sem, 0);
   task->taskgroup = taskgroup;
 }
@@ -532,18 +873,29 @@ GOMP_taskgroup_end (void)
   struct gomp_taskgroup *taskgroup;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
-  if (__atomic_load_n (&taskgroup->children, MEMMODEL_ACQUIRE) == NULL)
+
+  /* The acquire barrier on load of taskgroup->num_children here
+     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+     It is not necessary that we synchronize with other non-0 writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskgroup_end.  */
+  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
+
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
       bool cancelled = false;
       if (taskgroup->children == NULL)
	{
+	  if (taskgroup->num_children)
+	    goto do_wait;
	  gomp_mutex_unlock (&team->task_lock);
	  if (to_free)
	    {
@@ -570,10 +922,18 @@ GOMP_taskgroup_end (void)
	    }
	}
       else
-	/* All tasks we are waiting for are already running
-	   in other threads.  Wait for them.  */
-	taskgroup->in_taskgroup_wait = true;
+	{
+	 do_wait:
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  taskgroup->in_taskgroup_wait = true;
+	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -587,19 +947,18 @@ GOMP_taskgroup_end (void)
	  thr->task = task;
	}
       else
-	{
-	  gomp_sem_wait (&taskgroup->taskgroup_sem);
-	  taskgroup->in_taskgroup_wait = false;
-	  goto finish;
-	}
+	gomp_sem_wait (&taskgroup->taskgroup_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  child_task->prev_taskgroup->next_taskgroup
	    = child_task->next_taskgroup;
	  child_task->next_taskgroup->prev_taskgroup
	    = child_task->prev_taskgroup;
+	  --taskgroup->num_children;
	  if (taskgroup->children == child_task)
	    {
	      if (child_task->next_taskgroup != child_task)
@@ -612,7 +971,13 @@ GOMP_taskgroup_end (void)
	  to_free = child_task;
	  child_task = NULL;
	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	}
     }
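As for the coverage of the four new tests below: depend-1.c checks the
basic flow, anti and output dependence plus concurrent-in cases in various
data-sharing setups, depend-2.c array section dependencies, depend-3.c
waiting through nested taskgroups and depend-4.c the interaction of depend
clauses with if (0) tasks.  They only need OpenMP 4.0 task depend support
in the compiler, so they should also build standalone with something like
gcc -fopenmp depend-1.c.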
--- libgomp/testsuite/libgomp.c/depend-4.c.jj	2013-09-27 11:42:37.283473918 +0200
+++ libgomp/testsuite/libgomp.c/depend-4.c	2013-09-27 11:49:37.781239095 +0200
@@ -0,0 +1,56 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1, y = 2, z = 3;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x, y, z) depend(inout: x, y) \
+			depend (in: z) if (x > 10)
+      {
+	if (x != 1 || y != 2 || z != 3)
+	  abort ();
+	x = 4;
+	y = 5;
+      }
+      /* The above task has depend clauses, but no dependencies
+	 on earlier tasks, and is if (0), so must be scheduled
+	 immediately.  */
+      if (x != 4 || y != 5)
+	abort ();
+    }
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x, y) depend(in: x, y)
+      {
+	usleep (10000);
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+      }
+      #pragma omp task shared (x, y) depend(in: x, y)
+      {
+	usleep (10000);
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+      }
+      #pragma omp task shared (x, y, z) depend(inout: x, y) \
+			depend (in: z) if (x > 10)
+      {
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+	x = 6;
+	y = 7;
+      }
+      /* The above task has depend clauses, and may have dependencies
+	 on earlier tasks, so even though it is if (0), it can be
+	 deferred.  */
+    }
+    if (x != 6 || y != 7)
+      abort ();
+  }
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-1.c.jj	2013-09-26 17:57:26.011983435 +0200
+++ libgomp/testsuite/libgomp.c/depend-1.c	2013-09-26 21:09:57.128895308 +0200
@@ -0,0 +1,215 @@
+#include <stdlib.h>
+
+void
+dep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+dep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+void
+dep3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared (x) depend(out: x)
+      x = 2;
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+    }
+  }
+}
+
+void
+firstpriv (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task depend(out: x)
+    x = 2;
+    #pragma omp task depend(in: x)
+    if (x != 1)
+      abort ();
+  }
+}
+
+void
+antidep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared(x) depend(in: x)
+    if (x != 1)
+      abort ();
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+  }
+}
+
+void
+antidep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared(x) depend(in: x)
+      if (x != 1)
+	abort ();
+      #pragma omp task shared(x) depend(out: x)
+      x = 2;
+    }
+  }
+}
+
+void
+antidep3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared(x) depend(in: x)
+      if (x != 1)
+	abort ();
+      #pragma omp task shared(x) depend(out: x)
+      x = 2;
+    }
+  }
+}
+
+void
+outdep (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 0;
+    #pragma omp task shared(x) depend(out: x)
+    x = 1;
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+    #pragma omp taskwait
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+void
+concurrent3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared (x) depend(out: x)
+      x = 2;
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+    }
+  }
+}
+
+int
+main ()
+{
+  dep ();
+  dep2 ();
+  dep3 ();
+  firstpriv ();
+  antidep ();
+  antidep2 ();
+  antidep3 ();
+  outdep ();
+  concurrent ();
+  concurrent2 ();
+  concurrent3 ();
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-2.c.jj	2013-09-26 18:56:19.808294100 +0200
+++ libgomp/testsuite/libgomp.c/depend-2.c	2013-09-26 19:46:29.732123749 +0200
@@ -0,0 +1,71 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+void
+foo (int do_sleep)
+{
+  int a[64], i, *p = a + 4, x = 0;
+  asm volatile ("" : "+r" (p));
+  for (i = 0; i < 64; i++)
+    a[i] = i + 8;
+  #pragma omp parallel private (i)
+  {
+    #pragma omp single nowait
+    {
+      for (i = 0; i < 8; i++)
+	{
+	  #pragma omp task depend(out: a[i * 8 : 4])
+	  a[i * 8] += (i + 2) * 9;
+	  #pragma omp task depend(out: p[i * 8 : 2])
+	  p[i * 8] += (i + 3) * 10;
+	  #pragma omp task depend(out: x)
+	  x = 1;
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(in: a[i * 8 : 4]) \
+			 depend(inout: a[i * 8 + 4 : 2]) \
+			 depend(in: a[0 : 4]) depend(in: x)
+	{
+	  if (a[0] != 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != 4 + i * 8 + 8 + (i + 3) * 10)
+	    abort ();
+	  p[i * 8] += a[i * 8];
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(inout: a[i * 8 : 4]) \
+			 depend(in: p[i * 8 : 2]) \
+			 depend(in: p[0 : 2], x)
+	{
+	  if (p[0] != 4 + 8 + 3 * 10 + 0 + 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != (4 + i * 8 + 8 + (i + 3) * 10
+			       + i * 8 + 8 + (i + 2) * 9))
+	    abort ();
+	  a[i * 8] += 2;
+	}
+      for (i = 0; i < 4; i++)
+	#pragma omp task depend(in: a[i * 16 : 4], a[i * 16 + 8 : 4], x)
+	{
+	  if (a[i * 16] != i * 16 + 8 + (2 * i + 2) * 9 + 2 || x != 1)
+	    abort ();
+	  if (p[i * 16 + 4] != i * 16 + 8 + 8 + (2 * i + 1 + 2) * 9 + 2)
+	    abort ();
+	}
+    }
+    if (do_sleep)
+      sleep (1);
+  }
+}
+
+int
+main ()
+{
+  foo (1);
+  foo (0);
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-3.c.jj	2013-09-27 11:32:44.410621977 +0200
+++ libgomp/testsuite/libgomp.c/depend-3.c	2013-09-27 11:39:25.500493830 +0200
@@ -0,0 +1,51 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1, y = 2;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x) depend(in: x)
+      {
+	usleep (10000);
+	if (x != 1)
+	  abort ();
+      }
+      #pragma omp taskgroup
+      {
+	#pragma omp task shared (x) depend(in: x)
+	{
+	  usleep (15000);
+	  if (x != 1)
+	    abort ();
+	}
+	#pragma omp task shared (y) depend(inout: y)
+	{
+	  if (y != 2)
+	    abort ();
+	  y = 3;
+	}
+	#pragma omp taskgroup
+	{
+	  #pragma omp task shared (x) depend(in: x)
+	  {
+	    usleep (13000);
+	    if (x != 1)
+	      abort ();
+	  }
+	  #pragma omp taskgroup
+	  {
+	    #pragma omp task shared (x) depend(out: x)
+	    x = 2;
+	  }
+	}
+      }
+    }
+  }
+  return 0;
+}

	Jakub