On Fri, Sep 27, 2013 at 01:48:36AM +0200, Jakub Jelinek wrote:
> Perhaps.  What if I do just minor cleanup (use flexible array members for
> the reallocated vectors, and perhaps keep only the last out/inout task
> in the hash table chains rather than all of them), retest, commit and then
> we can discuss/incrementally improve it?
Here is what I've committed now.  The incremental changes were really only
using a structure with a flexible array member for the dependers vectors,
removing (well, marking redundant) an earlier !ent->is_in entry when adding
a !is_in entry into the chain, and the addition of new testcases.  Let's
improve it incrementally later.

2013-09-27  Jakub Jelinek  <ja...@redhat.com>

	* libgomp.h: Include stdlib.h.
	(struct gomp_task_depend_entry, struct gomp_dependers_vec): New
	types.
	(struct gomp_task): Add dependers, depend_hash, depend_count,
	num_dependees and depend fields.
	(struct gomp_taskgroup): Add num_children field.
	(gomp_finish_task): Free depend_hash if non-NULL.
	* libgomp_g.h (GOMP_task): Add depend argument.
	* hashtab.h: New file.
	* task.c: Include hashtab.h.
	(hash_entry_type): New typedef.
	(htab_alloc, htab_free, htab_hash, htab_eq): New inlines.
	(gomp_init_task): Clear dependers, depend_hash and depend_count
	fields.
	(GOMP_task): Add depend argument, handle depend clauses.  Increment
	num_children field in taskgroup.
	(gomp_task_run_pre): Don't increment task_running_count here, nor
	clear task_pending bit.
	(gomp_task_run_post_handle_depend_hash,
	gomp_task_run_post_handle_dependers,
	gomp_task_run_post_handle_depend): New functions.
	(gomp_task_run_post_remove_parent): Clear in_taskwait before
	signalling corresponding semaphore.
	(gomp_task_run_post_remove_taskgroup): Decrement num_children
	field and make the decrement to 0 a MEMMODEL_RELEASE operation,
	rather than storing NULL to taskgroup->children.  Clear
	in_taskgroup_wait before signalling corresponding semaphore.
	(gomp_barrier_handle_tasks): Move task_running_count increment
	and task_pending bit clearing here.  Call
	gomp_task_run_post_handle_depend.  If more than one new task has
	been queued, wake other threads if needed.
	(GOMP_taskwait): Call gomp_task_run_post_handle_depend.  If more
	than one new task has been queued, wake other threads if needed.
	After waiting on taskwait_sem, enter critical section again.
	(GOMP_taskgroup_start): Initialize num_children field.
	(GOMP_taskgroup_end): Check num_children instead of children
	before critical section.  If children is NULL, but num_children
	is non-zero, wait on taskgroup_sem.  Call
	gomp_task_run_post_handle_depend.  If more than one new task has
	been queued, wake other threads if needed.  After waiting on
	taskgroup_sem, enter critical section again.
	* testsuite/libgomp.c/depend-1.c: New test.
	* testsuite/libgomp.c/depend-2.c: New test.
	* testsuite/libgomp.c/depend-3.c: New test.
	* testsuite/libgomp.c/depend-4.c: New test.
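For reference on the new interface: the extra depend argument of GOMP_task
is a flat array describing the task's depend clauses - depend[0] holds the
number of depend clauses, depend[1] the number of out/inout clauses (those
are laid out first), and the depended-on addresses follow from depend[2]
onwards; bit 3 (the 8 value) in the flags argument says whether the array
is present at all.  So for a task with depend(inout: x) depend(in: y, z)
the compiler emits a call roughly like (fn, data, cpyfn, arg_size and
arg_align standing for the usual argument passing):

  void *depend[] = { (void *) 3, (void *) 1, &x, &y, &z };
  GOMP_task (fn, data, cpyfn, arg_size, arg_align, true, 8, depend);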
--- libgomp/libgomp.h.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/libgomp.h	2013-09-27 09:05:17.025402127 +0200
@@ -39,6 +39,7 @@
 
 #include <pthread.h>
 #include <stdbool.h>
+#include <stdlib.h>
 
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility push(hidden)
@@ -253,7 +254,26 @@ enum gomp_task_kind
   GOMP_TASK_TIED
 };
 
+struct gomp_task;
 struct gomp_taskgroup;
+struct htab;
+
+struct gomp_task_depend_entry
+{
+  void *addr;
+  struct gomp_task_depend_entry *next;
+  struct gomp_task_depend_entry *prev;
+  struct gomp_task *task;
+  bool is_in;
+  bool redundant;
+};
+
+struct gomp_dependers_vec
+{
+  size_t n_elem;
+  size_t allocated;
+  struct gomp_task *elem[];
+};
 
 /* This structure describes a "task" to be run by a thread.  */
@@ -268,6 +288,10 @@ struct gomp_task
   struct gomp_task *next_taskgroup;
   struct gomp_task *prev_taskgroup;
   struct gomp_taskgroup *taskgroup;
+  struct gomp_dependers_vec *dependers;
+  struct htab *depend_hash;
+  size_t depend_count;
+  size_t num_dependees;
   struct gomp_task_icv icv;
   void (*fn) (void *);
   void *fn_data;
@@ -277,6 +301,7 @@ struct gomp_task
   bool final_task;
   bool copy_ctors_done;
   gomp_sem_t taskwait_sem;
+  struct gomp_task_depend_entry depend[];
 };
 
 struct gomp_taskgroup
@@ -286,6 +311,7 @@ struct gomp_taskgroup
   bool in_taskgroup_wait;
   bool cancelled;
   gomp_sem_t taskgroup_sem;
+  size_t num_children;
 };
 
 /* This structure describes a "team" of threads.  These are the threads
@@ -525,6 +551,8 @@ extern void gomp_barrier_handle_tasks (g
 static void inline
 gomp_finish_task (struct gomp_task *task)
 {
+  if (__builtin_expect (task->depend_hash != NULL, 0))
+    free (task->depend_hash);
   gomp_sem_destroy (&task->taskwait_sem);
 }
 
--- libgomp/libgomp_g.h.jj	2013-09-26 09:43:10.902930838 +0200
+++ libgomp/libgomp_g.h	2013-09-26 10:08:44.820160094 +0200
@@ -178,7 +178,7 @@ extern bool GOMP_cancellation_point (int
 /* task.c */
 
 extern void GOMP_task (void (*) (void *), void *, void (*) (void *, void *),
-		       long, long, bool, unsigned);
+		       long, long, bool, unsigned, void **);
 extern void GOMP_taskwait (void);
 extern void GOMP_taskyield (void);
 extern void GOMP_taskgroup_start (void);
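Note the trailing depend[] flexible array in struct gomp_task: the depend
entries are carved out of the same allocation as the task itself and its
argument block - see the gomp_malloc in GOMP_task below, which allocates
roughly sizeof (*task) + ndepend * sizeof (struct gomp_task_depend_entry)
+ arg_size + arg_align - 1 in one go - so gomp_finish_task only has to
free the separately allocated depend_hash.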
--- libgomp/hashtab.h.jj	2013-09-26 10:08:51.031128932 +0200
+++ libgomp/hashtab.h	2013-09-26 21:07:17.757697391 +0200
@@ -0,0 +1,443 @@
+/* An expandable hash table datatype.
+   Copyright (C) 1999-2013
+   Free Software Foundation, Inc.
+   Contributed by Vladimir Makarov <vmaka...@cygnus.com>.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA.  */
+
+/* The hash table code copied from include/hashtab.[hc] and adjusted,
+   so that the hash table entries are in the flexible array at the end
+   of the control structure, no callbacks are used and the elements in the
+   table are of the hash_entry_type type.
+   Before including this file, define hash_entry_type type and
+   htab_alloc and htab_free functions.  After including it, define
+   htab_hash and htab_eq inline functions.  */
+
+/* This package implements basic hash table functionality.  It is possible
+   to search for an entry, create an entry and destroy an entry.
+
+   Elements in the table are generic pointers.
+
+   The size of the table is not fixed; if the occupancy of the table
+   grows too high the hash table will be expanded.
+
+   The abstract data implementation is based on generalized Algorithm D
+   from Knuth's book "The art of computer programming".  The hash table is
+   expanded by creation of a new hash table and transferring elements from
+   the old table to the new table.  */
+
+/* The type for a hash code.  */
+typedef unsigned int hashval_t;
+
+static inline hashval_t htab_hash (hash_entry_type);
+static inline bool htab_eq (hash_entry_type, hash_entry_type);
+
+/* This macro defines reserved value for empty table entry.  */
+
+#define HTAB_EMPTY_ENTRY    ((hash_entry_type) 0)
+
+/* This macro defines reserved value for table entry which contained
+   a deleted element.  */
+
+#define HTAB_DELETED_ENTRY  ((hash_entry_type) 1)
+
+/* Hash tables are of the following type.  The structure
+   (implementation) of this type is not needed for using the hash
+   tables.  All work with hash table should be executed only through
+   functions mentioned below.  The size of this structure is subject to
+   change.  */
+
+struct htab {
+  /* Current size (in entries) of the hash table.  */
+  size_t size;
+
+  /* Current number of elements including also deleted elements.  */
+  size_t n_elements;
+
+  /* Current number of deleted elements in the table.  */
+  size_t n_deleted;
+
+  /* Current size (in entries) of the hash table, as an index into the
+     table of primes.  */
+  unsigned int size_prime_index;
+
+  /* Table itself.  */
+  hash_entry_type entries[];
+};
+
+typedef struct htab *htab_t;
+
+/* An enum saying whether we insert into the hash table or not.  */
+enum insert_option {NO_INSERT, INSERT};
+
+/* Table of primes and multiplicative inverses.
+
+   Note that these are not minimally reduced inverses.  Unlike when generating
+   code to divide by a constant, we want to be able to use the same algorithm
+   all the time.  All of these inverses (are implied to) have bit 32 set.
+
+   For the record, the function that computed the table is in
+   libiberty/hashtab.c.  */
+
+struct prime_ent
+{
+  hashval_t prime;
+  hashval_t inv;
+  hashval_t inv_m2;	/* inverse of prime-2 */
+  hashval_t shift;
+};
+
+static struct prime_ent const prime_tab[] = {
+  { 7, 0x24924925, 0x9999999b, 2 },
+  { 13, 0x3b13b13c, 0x745d1747, 3 },
+  { 31, 0x08421085, 0x1a7b9612, 4 },
+  { 61, 0x0c9714fc, 0x15b1e5f8, 5 },
+  { 127, 0x02040811, 0x0624dd30, 6 },
+  { 251, 0x05197f7e, 0x073260a5, 7 },
+  { 509, 0x01824366, 0x02864fc8, 8 },
+  { 1021, 0x00c0906d, 0x014191f7, 9 },
+  { 2039, 0x0121456f, 0x0161e69e, 10 },
+  { 4093, 0x00300902, 0x00501908, 11 },
+  { 8191, 0x00080041, 0x00180241, 12 },
+  { 16381, 0x000c0091, 0x00140191, 13 },
+  { 32749, 0x002605a5, 0x002a06e6, 14 },
+  { 65521, 0x000f00e2, 0x00110122, 15 },
+  { 131071, 0x00008001, 0x00018003, 16 },
+  { 262139, 0x00014002, 0x0001c004, 17 },
+  { 524287, 0x00002001, 0x00006001, 18 },
+  { 1048573, 0x00003001, 0x00005001, 19 },
+  { 2097143, 0x00004801, 0x00005801, 20 },
+  { 4194301, 0x00000c01, 0x00001401, 21 },
+  { 8388593, 0x00001e01, 0x00002201, 22 },
+  { 16777213, 0x00000301, 0x00000501, 23 },
+  { 33554393, 0x00001381, 0x00001481, 24 },
+  { 67108859, 0x00000141, 0x000001c1, 25 },
+  { 134217689, 0x000004e1, 0x00000521, 26 },
+  { 268435399, 0x00000391, 0x000003b1, 27 },
+  { 536870909, 0x00000019, 0x00000029, 28 },
+  { 1073741789, 0x0000008d, 0x00000095, 29 },
+  { 2147483647, 0x00000003, 0x00000007, 30 },
+  /* Avoid "decimal constant so large it is unsigned" for 4294967291.  */
+  { 0xfffffffb, 0x00000006, 0x00000008, 31 }
+};
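To illustrate the inverse trick on the first entry (prime 7, inv
0x24924925, shift 2), take x = 10: t1 = ((unsigned long long) 10 *
0x24924925) >> 32 is 1, t2 = 10 - 1 = 9, t3 = 9 >> 1 = 4, t4 = 1 + 4 = 5,
q = 5 >> 2 = 1 and r = 10 - 1 * 7 = 3, which indeed equals 10 % 7; see
htab_mod_1 below for the exact sequence of operations.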
+/* The following function returns an index into the above table of the
+   nearest prime number which is greater than N, and near a power of two.  */
+
+static unsigned int
+higher_prime_index (unsigned long n)
+{
+  unsigned int low = 0;
+  unsigned int high = sizeof(prime_tab) / sizeof(prime_tab[0]);
+
+  while (low != high)
+    {
+      unsigned int mid = low + (high - low) / 2;
+      if (n > prime_tab[mid].prime)
+	low = mid + 1;
+      else
+	high = mid;
+    }
+
+  /* If we've run out of primes, abort.  */
+  if (n > prime_tab[low].prime)
+    abort ();
+
+  return low;
+}
+
+/* Return the current size of given hash table.  */
+
+static inline size_t
+htab_size (htab_t htab)
+{
+  return htab->size;
+}
+
+/* Return the current number of elements in given hash table.  */
+
+static inline size_t
+htab_elements (htab_t htab)
+{
+  return htab->n_elements - htab->n_deleted;
+}
+
+/* Return X % Y.  */
+
+static inline hashval_t
+htab_mod_1 (hashval_t x, hashval_t y, hashval_t inv, int shift)
+{
+  /* The multiplicative inverses computed above are for 32-bit types, and
+     require that we be able to compute a highpart multiply.  */
+  if (sizeof (hashval_t) * __CHAR_BIT__ <= 32)
+    {
+      hashval_t t1, t2, t3, t4, q, r;
+
+      t1 = ((unsigned long long)x * inv) >> 32;
+      t2 = x - t1;
+      t3 = t2 >> 1;
+      t4 = t1 + t3;
+      q  = t4 >> shift;
+      r  = x - (q * y);
+
+      return r;
+    }
+
+  /* Otherwise just use the native division routines.  */
+  return x % y;
+}
+
+/* Compute the primary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return htab_mod_1 (hash, p->prime, p->inv, p->shift);
+}
+
+/* Compute the secondary hash for HASH given HTAB's current size.  */
+
+static inline hashval_t
+htab_mod_m2 (hashval_t hash, htab_t htab)
+{
+  const struct prime_ent *p = &prime_tab[htab->size_prime_index];
+  return 1 + htab_mod_1 (hash, p->prime - 2, p->inv_m2, p->shift);
+}
+
+/* Create hash table of size SIZE.  */
+
+static htab_t
+htab_create (size_t size)
+{
+  htab_t result;
+  unsigned int size_prime_index;
+
+  size_prime_index = higher_prime_index (size);
+  size = prime_tab[size_prime_index].prime;
+
+  result = (htab_t) htab_alloc (sizeof (struct htab)
+				+ size * sizeof (hash_entry_type));
+  result->size = size;
+  result->n_elements = 0;
+  result->n_deleted = 0;
+  result->size_prime_index = size_prime_index;
+  memset (result->entries, 0, size * sizeof (hash_entry_type));
+  return result;
+}
+
+/* Similar to htab_find_slot, but without several unwanted side effects:
+   - Does not call htab_eq when it finds an existing entry.
+   - Does not change the count of elements in the hash table.
+   This function also assumes there are no deleted entries in the table.
+   HASH is the hash value for the element to be inserted.  */
+
+static hash_entry_type *
+find_empty_slot_for_expand (htab_t htab, hashval_t hash)
+{
+  hashval_t index = htab_mod (hash, htab);
+  size_t size = htab_size (htab);
+  hash_entry_type *slot = htab->entries + index;
+  hashval_t hash2;
+
+  if (*slot == HTAB_EMPTY_ENTRY)
+    return slot;
+  else if (*slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      slot = htab->entries + index;
+      if (*slot == HTAB_EMPTY_ENTRY)
+	return slot;
+      else if (*slot == HTAB_DELETED_ENTRY)
+	abort ();
+    }
+}
+/* The following function changes size of memory allocated for the
+   entries and repeatedly inserts the table elements.  The occupancy
+   of the table after the call will be about 50%.  Naturally the hash
+   table must already exist.  Remember also that the place of the
+   table entries is changed.  */
+
+static htab_t
+htab_expand (htab_t htab)
+{
+  htab_t nhtab;
+  hash_entry_type *olimit;
+  hash_entry_type *p;
+  size_t osize, elts;
+
+  osize = htab->size;
+  olimit = htab->entries + osize;
+  elts = htab_elements (htab);
+
+  /* Resize only when table after removal of unused elements is either
+     too full or too empty.  */
+  if (elts * 2 > osize || (elts * 8 < osize && osize > 32))
+    nhtab = htab_create (elts * 2);
+  else
+    nhtab = htab_create (osize - 1);
+  nhtab->n_elements = htab->n_elements - htab->n_deleted;
+
+  p = htab->entries;
+  do
+    {
+      hash_entry_type x = *p;
+
+      if (x != HTAB_EMPTY_ENTRY && x != HTAB_DELETED_ENTRY)
+	*find_empty_slot_for_expand (nhtab, htab_hash (x)) = x;
+
+      p++;
+    }
+  while (p < olimit);
+
+  htab_free (htab);
+  return nhtab;
+}
+
+/* This function searches for a hash table entry equal to the given
+   element.  It cannot be used to insert or delete an element.  */
+
+static hash_entry_type
+htab_find (htab_t htab, const hash_entry_type element)
+{
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+
+  size = htab_size (htab);
+  index = htab_mod (hash, htab);
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY
+      || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+    return entry;
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY
+	  || (entry != HTAB_DELETED_ENTRY && htab_eq (entry, element)))
+	return entry;
+    }
+}
+
+/* This function searches for a hash table slot containing an entry
+   equal to the given element.  To delete an entry, call this with
+   insert=NO_INSERT, then call htab_clear_slot on the slot returned
+   (possibly after doing some checks).  To insert an entry, call this
+   with insert=INSERT, then write the value you want into the returned
+   slot.  */
+
+static hash_entry_type *
+htab_find_slot (htab_t *htabp, const hash_entry_type element,
+		enum insert_option insert)
+{
+  hash_entry_type *first_deleted_slot;
+  hashval_t index, hash2, hash = htab_hash (element);
+  size_t size;
+  hash_entry_type entry;
+  htab_t htab = *htabp;
+
+  size = htab_size (htab);
+  if (insert == INSERT && size * 3 <= htab->n_elements * 4)
+    {
+      htab = *htabp = htab_expand (htab);
+      size = htab_size (htab);
+    }
+
+  index = htab_mod (hash, htab);
+
+  first_deleted_slot = NULL;
+
+  entry = htab->entries[index];
+  if (entry == HTAB_EMPTY_ENTRY)
+    goto empty_entry;
+  else if (entry == HTAB_DELETED_ENTRY)
+    first_deleted_slot = &htab->entries[index];
+  else if (htab_eq (entry, element))
+    return &htab->entries[index];
+
+  hash2 = htab_mod_m2 (hash, htab);
+  for (;;)
+    {
+      index += hash2;
+      if (index >= size)
+	index -= size;
+
+      entry = htab->entries[index];
+      if (entry == HTAB_EMPTY_ENTRY)
+	goto empty_entry;
+      else if (entry == HTAB_DELETED_ENTRY)
+	{
+	  if (!first_deleted_slot)
+	    first_deleted_slot = &htab->entries[index];
+	}
+      else if (htab_eq (entry, element))
+	return &htab->entries[index];
+    }
+
+ empty_entry:
+  if (insert == NO_INSERT)
+    return NULL;
+
+  if (first_deleted_slot)
+    {
+      htab->n_deleted--;
+      *first_deleted_slot = HTAB_EMPTY_ENTRY;
+      return first_deleted_slot;
+    }
+
+  htab->n_elements++;
+  return &htab->entries[index];
+}
+/* This function clears a specified slot in a hash table.  It is
+   useful when you've already done the lookup and don't want to do it
+   again.  */
+
+static inline void
+htab_clear_slot (htab_t htab, hash_entry_type *slot)
+{
+  if (slot < htab->entries || slot >= htab->entries + htab_size (htab)
+      || *slot == HTAB_EMPTY_ENTRY || *slot == HTAB_DELETED_ENTRY)
+    abort ();
+
+  *slot = HTAB_DELETED_ENTRY;
+  htab->n_deleted++;
+}
+
+/* Returns a hash code for pointer P.  Simplified version of evahash.  */
+
+static inline hashval_t
+hash_pointer (const void *p)
+{
+  uintptr_t v = (uintptr_t) p;
+  if (sizeof (v) > sizeof (hashval_t))
+    v ^= v >> (sizeof (uintptr_t) / 2 * __CHAR_BIT__);
+  return v;
+}
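hashtab.h is thus used as a poor man's template: an includer first defines
hash_entry_type together with htab_alloc/htab_free, includes the header and
then supplies htab_hash/htab_eq for its entry type, along the lines of
(my_entry and its key field are made-up names just for illustration):

  typedef struct my_entry *hash_entry_type;
  static inline void *htab_alloc (size_t size) { return gomp_malloc (size); }
  static inline void htab_free (void *ptr) { free (ptr); }
  #include "hashtab.h"
  static inline hashval_t htab_hash (hash_entry_type e)
  { return hash_pointer (e->key); }
  static inline bool htab_eq (hash_entry_type x, hash_entry_type y)
  { return x->key == y->key; }

task.c below does exactly this for gomp_task_depend_entry keyed on addr.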
--- libgomp/task.c.jj	2013-09-26 09:43:10.903930832 +0200
+++ libgomp/task.c	2013-09-27 09:30:57.798187840 +0200
@@ -29,6 +29,33 @@
 #include <stdlib.h>
 #include <string.h>
 
+typedef struct gomp_task_depend_entry *hash_entry_type;
+
+static inline void *
+htab_alloc (size_t size)
+{
+  return gomp_malloc (size);
+}
+
+static inline void
+htab_free (void *ptr)
+{
+  free (ptr);
+}
+
+#include "hashtab.h"
+
+static inline hashval_t
+htab_hash (hash_entry_type element)
+{
+  return hash_pointer (element->addr);
+}
+
+static inline bool
+htab_eq (hash_entry_type x, hash_entry_type y)
+{
+  return x->addr == y->addr;
+}
 
 /* Create a new task data structure.  */
 
@@ -45,6 +72,9 @@ gomp_init_task (struct gomp_task *task,
   task->copy_ctors_done = false;
   task->children = NULL;
   task->taskgroup = NULL;
+  task->dependers = NULL;
+  task->depend_hash = NULL;
+  task->depend_count = 0;
   gomp_sem_init (&task->taskwait_sem, 0);
 }
 
@@ -80,7 +110,8 @@ gomp_clear_parent (struct gomp_task *chi
 
 void
 GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) (void *, void *),
-	   long arg_size, long arg_align, bool if_clause, unsigned flags)
+	   long arg_size, long arg_align, bool if_clause, unsigned flags,
+	   void **depend)
 {
   struct gomp_thread *thr = gomp_thread ();
   struct gomp_team *team = thr->ts.team;
@@ -108,6 +139,38 @@ GOMP_task (void (*fn) (void *), void *da
     {
       struct gomp_task task;
 
+      /* If there are depend clauses and earlier deferred sibling tasks
+	 with depend clauses, check if there isn't a dependency.  If there
+	 is, fall through to the deferred task handling, as we can't
+	 schedule such tasks right away.  There is no need to handle
+	 depend clauses for non-deferred tasks other than this, because
+	 the parent task is suspended until the child task finishes and thus
+	 it can't start further child tasks.  */
+      if ((flags & 8) && thr->task && thr->task->depend_hash)
+	{
+	  struct gomp_task *parent = thr->task;
+	  struct gomp_task_depend_entry elem, *ent = NULL;
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  gomp_mutex_lock (&team->task_lock);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      elem.addr = depend[i + 2];
+	      ent = htab_find (parent->depend_hash, &elem);
+	      for (; ent; ent = ent->next)
+		if (i >= nout && ent->is_in)
+		  continue;
+		else
+		  break;
+	      if (ent)
+		break;
+	    }
+	  gomp_mutex_unlock (&team->task_lock);
+	  if (ent)
+	    goto defer;
+	}
+
       gomp_init_task (&task, thr->task, gomp_icv (false));
       task.kind = GOMP_TASK_IFFALSE;
       task.final_task = (thr->task && thr->task->final_task) || (flags & 2);
@@ -146,14 +209,20 @@ GOMP_task (void (*fn) (void *), void *da
     }
   else
     {
+    defer:;
       struct gomp_task *task;
      struct gomp_task *parent = thr->task;
      struct gomp_taskgroup *taskgroup = parent->taskgroup;
      char *arg;
      bool do_wake;
+      size_t depend_size = 0;
 
-      task = gomp_malloc (sizeof (*task) + arg_size + arg_align - 1);
-      arg = (char *) (((uintptr_t) (task + 1) + arg_align - 1)
+      if (flags & 8)
+	depend_size = ((uintptr_t) depend[0]
+		       * sizeof (struct gomp_task_depend_entry));
+      task = gomp_malloc (sizeof (*task) + depend_size
+			  + arg_size + arg_align - 1);
+      arg = (char *) (((uintptr_t) (task + 1) + depend_size + arg_align - 1)
		      & ~(uintptr_t) (arg_align - 1));
      gomp_init_task (task, parent, gomp_icv (false));
      task->kind = GOMP_TASK_IFFALSE;
@@ -171,7 +240,6 @@ GOMP_task (void (*fn) (void *), void *da
      task->kind = GOMP_TASK_WAITING;
      task->fn = fn;
      task->fn_data = arg;
-      task->in_tied_task = true;
      task->final_task = (flags & 2) >> 1;
      gomp_mutex_lock (&team->task_lock);
      /* If parallel or taskgroup has been cancelled, don't start new
@@ -185,6 +253,117 @@ GOMP_task (void (*fn) (void *), void *da
	  free (task);
	  return;
	}
+      if (taskgroup)
+	taskgroup->num_children++;
+      if (depend_size)
+	{
+	  size_t ndepend = (uintptr_t) depend[0];
+	  size_t nout = (uintptr_t) depend[1];
+	  size_t i;
+	  hash_entry_type ent;
+
+	  task->depend_count = ndepend;
+	  task->num_dependees = 0;
+	  if (parent->depend_hash == NULL)
+	    parent->depend_hash
+	      = htab_create (2 * ndepend > 12 ? 2 * ndepend : 12);
+	  for (i = 0; i < ndepend; i++)
+	    {
+	      task->depend[i].addr = depend[2 + i];
+	      task->depend[i].next = NULL;
+	      task->depend[i].prev = NULL;
+	      task->depend[i].task = task;
+	      task->depend[i].is_in = i >= nout;
+	      task->depend[i].redundant = false;
+
+	      hash_entry_type *slot
+		= htab_find_slot (&parent->depend_hash, &task->depend[i],
+				  INSERT);
+	      hash_entry_type out = NULL;
+	      if (*slot)
+		{
+		  /* If multiple depends on the same task are the
+		     same, all but the first one are redundant.
+		     As inout/out come first, if any of them is
+		     inout/out, it will win, which is the right
+		     semantics.  */
+		  if ((*slot)->task == task)
+		    {
+		      task->depend[i].redundant = true;
+		      continue;
+		    }
+		  for (ent = *slot; ent; ent = ent->next)
+		    {
+		      /* depend(in:...) doesn't depend on earlier
+			 depend(in:...).  */
+		      if (i >= nout && ent->is_in)
+			continue;
+
+		      if (!ent->is_in)
+			out = ent;
+
+		      struct gomp_task *tsk = ent->task;
+		      if (tsk->dependers == NULL)
+			{
+			  tsk->dependers
+			    = gomp_malloc (sizeof (struct gomp_dependers_vec)
+					   + 6 * sizeof (struct gomp_task *));
+			  tsk->dependers->n_elem = 1;
+			  tsk->dependers->allocated = 6;
+			  tsk->dependers->elem[0] = task;
+			  task->num_dependees++;
+			  continue;
+			}
+		      /* We already have some other dependency on tsk
+			 from earlier depend clause.  */
+		      else if (tsk->dependers->n_elem
+			       && (tsk->dependers->elem[tsk->dependers->n_elem
+							- 1]
+				   == task))
+			continue;
+		      else if (tsk->dependers->n_elem
+			       == tsk->dependers->allocated)
+			{
+			  tsk->dependers->allocated
+			    = tsk->dependers->allocated * 2 + 2;
+			  tsk->dependers
+			    = gomp_realloc (tsk->dependers,
+					    sizeof (struct gomp_dependers_vec)
+					    + (tsk->dependers->allocated
+					       * sizeof (struct gomp_task *)));
+			}
+		      tsk->dependers->elem[tsk->dependers->n_elem++] = task;
+		      task->num_dependees++;
+		    }
+		  task->depend[i].next = *slot;
+		  (*slot)->prev = &task->depend[i];
+		}
+	      *slot = &task->depend[i];
+
+	      /* There is no need to store more than one depend({,in}out:)
+		 task per address in the hash table chain, because each out
+		 depends on all earlier outs, thus it is enough to record
+		 just the last depend({,in}out:).  For depend(in:), we need
+		 to keep all of the previous ones not terminated yet, because
+		 a later depend({,in}out:) might need to depend on all of
+		 them.  So, if the new task's clause is depend({,in}out:),
+		 we know there is at most one other depend({,in}out:) clause
+		 in the list (out) and to maintain the invariant we now
+		 need to remove it from the list.  */
+	      if (!task->depend[i].is_in && out)
+		{
+		  if (out->next)
+		    out->next->prev = out->prev;
+		  out->prev->next = out->next;
+		  out->redundant = true;
+		}
+	    }
+	  if (task->num_dependees)
+	    {
+	      gomp_mutex_unlock (&team->task_lock);
+	      return;
+	    }
+	}
       if (parent->children)
	{
	  task->next_child = parent->children;
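To make the invariant concrete, consider four sibling tasks created in this
order:

  #pragma omp task depend(out: x)	/* A */
  #pragma omp task depend(in: x)	/* B */
  #pragma omp task depend(in: x)	/* C */
  #pragma omp task depend(out: x)	/* D */

B and C each record a dependency only on A (in doesn't depend on in), while
D walks the chain C -> B -> A, records dependencies on all three (so its
num_dependees is 3) and, being an out itself, unlinks A - the only other
out in the chain - so the chain for &x afterwards is D -> C -> B.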
@@ -259,12 +438,133 @@ gomp_task_run_pre (struct gomp_task *chi
	  || (taskgroup && taskgroup->cancelled))
	 && !child_task->copy_ctors_done)
     return true;
-  team->task_running_count++;
-  if (team->task_count == team->task_running_count)
-    gomp_team_barrier_clear_task_pending (&team->barrier);
   return false;
 }
 
+static void
+gomp_task_run_post_handle_depend_hash (struct gomp_task *child_task)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i;
+
+  for (i = 0; i < child_task->depend_count; i++)
+    if (!child_task->depend[i].redundant)
+      {
+	if (child_task->depend[i].next)
+	  child_task->depend[i].next->prev = child_task->depend[i].prev;
+	if (child_task->depend[i].prev)
+	  child_task->depend[i].prev->next = child_task->depend[i].next;
+	else
+	  {
+	    hash_entry_type *slot
+	      = htab_find_slot (&parent->depend_hash, &child_task->depend[i],
+				NO_INSERT);
+	    if (*slot != &child_task->depend[i])
+	      abort ();
+	    if (child_task->depend[i].next)
+	      *slot = child_task->depend[i].next;
+	    else
+	      htab_clear_slot (parent->depend_hash, slot);
+	  }
+      }
+}
+
+static size_t
+gomp_task_run_post_handle_dependers (struct gomp_task *child_task,
+				     struct gomp_team *team)
+{
+  struct gomp_task *parent = child_task->parent;
+  size_t i, count = child_task->dependers->n_elem, ret = 0;
+  for (i = 0; i < count; i++)
+    {
+      struct gomp_task *task = child_task->dependers->elem[i];
+      if (--task->num_dependees != 0)
+	continue;
+
+      struct gomp_taskgroup *taskgroup = task->taskgroup;
+      if (parent)
+	{
+	  if (parent->children)
+	    {
+	      task->next_child = parent->children;
+	      task->prev_child = parent->children->prev_child;
+	      task->next_child->prev_child = task;
+	      task->prev_child->next_child = task;
+	    }
+	  else
+	    {
+	      task->next_child = task;
+	      task->prev_child = task;
+	    }
+	  parent->children = task;
+	  if (parent->in_taskwait)
+	    {
+	      parent->in_taskwait = false;
+	      gomp_sem_post (&parent->taskwait_sem);
+	    }
+	}
+      if (taskgroup)
+	{
+	  if (taskgroup->children)
+	    {
+	      task->next_taskgroup = taskgroup->children;
+	      task->prev_taskgroup = taskgroup->children->prev_taskgroup;
+	      task->next_taskgroup->prev_taskgroup = task;
+	      task->prev_taskgroup->next_taskgroup = task;
+	    }
+	  else
+	    {
+	      task->next_taskgroup = task;
+	      task->prev_taskgroup = task;
+	    }
+	  taskgroup->children = task;
+	  if (taskgroup->in_taskgroup_wait)
+	    {
+	      taskgroup->in_taskgroup_wait = false;
+	      gomp_sem_post (&taskgroup->taskgroup_sem);
+	    }
+	}
+      if (team->task_queue)
+	{
+	  task->next_queue = team->task_queue;
+	  task->prev_queue = team->task_queue->prev_queue;
+	  task->next_queue->prev_queue = task;
+	  task->prev_queue->next_queue = task;
+	}
+      else
+	{
+	  task->next_queue = task;
+	  task->prev_queue = task;
+	  team->task_queue = task;
+	}
+      ++team->task_count;
+      ++ret;
+    }
+  free (child_task->dependers);
+  child_task->dependers = NULL;
+  if (ret > 1)
+    gomp_team_barrier_set_task_pending (&team->barrier);
+  return ret;
+}
+
+static inline size_t
+gomp_task_run_post_handle_depend (struct gomp_task *child_task,
+				  struct gomp_team *team)
+{
+  if (child_task->depend_count == 0)
+    return 0;
+
+  /* If parent is gone already, the hash table is freed and nothing
+     will use the hash table anymore, no need to remove anything from it.  */
+  if (child_task->parent != NULL)
+    gomp_task_run_post_handle_depend_hash (child_task);
+
+  if (child_task->dependers == NULL)
+    return 0;
+
+  return gomp_task_run_post_handle_dependers (child_task, team);
+}
+
 static inline void
 gomp_task_run_post_remove_parent (struct gomp_task *child_task)
 {
@@ -286,7 +586,10 @@ gomp_task_run_post_remove_parent (struct
	 before the NULL is written.  */
       __atomic_store_n (&parent->children, NULL, MEMMODEL_RELEASE);
       if (parent->in_taskwait)
-	gomp_sem_post (&parent->taskwait_sem);
+	{
+	  parent->in_taskwait = false;
+	  gomp_sem_post (&parent->taskwait_sem);
+	}
     }
 }
 
@@ -298,20 +601,29 @@ gomp_task_run_post_remove_taskgroup (str
     return;
   child_task->prev_taskgroup->next_taskgroup = child_task->next_taskgroup;
   child_task->next_taskgroup->prev_taskgroup = child_task->prev_taskgroup;
-  if (taskgroup->children != child_task)
-    return;
-  if (child_task->next_taskgroup != child_task)
-    taskgroup->children = child_task->next_taskgroup;
+  if (taskgroup->num_children > 1)
+    --taskgroup->num_children;
   else
     {
-      /* We access task->children in GOMP_taskgroup_end
+      /* We access taskgroup->num_children in GOMP_taskgroup_end
	 outside of the task lock mutex region, so
	 need a release barrier here to ensure memory
	 written by child_task->fn above is flushed
	 before the NULL is written.  */
-      __atomic_store_n (&taskgroup->children, NULL, MEMMODEL_RELEASE);
+      __atomic_store_n (&taskgroup->num_children, 0, MEMMODEL_RELEASE);
+    }
+  if (taskgroup->children != child_task)
+    return;
+  if (child_task->next_taskgroup != child_task)
+    taskgroup->children = child_task->next_taskgroup;
+  else
+    {
+      taskgroup->children = NULL;
       if (taskgroup->in_taskgroup_wait)
-	gomp_sem_post (&taskgroup->taskgroup_sem);
+	{
+	  taskgroup->in_taskgroup_wait = false;
+	  gomp_sem_post (&taskgroup->taskgroup_sem);
+	}
     }
 }
 
@@ -323,6 +635,7 @@ gomp_barrier_handle_tasks (gomp_barrier_
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   gomp_mutex_lock (&team->task_lock);
   if (gomp_barrier_last_thread (state))
@@ -355,8 +668,17 @@ gomp_barrier_handle_tasks (gomp_barrier_
		}
	      goto finish_cancelled;
	    }
+	  team->task_running_count++;
+	  child_task->in_tied_task = true;
+	  if (team->task_count == team->task_running_count)
+	    gomp_team_barrier_clear_task_pending (&team->barrier);
	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -374,7 +696,9 @@ gomp_barrier_handle_tasks (gomp_barrier_
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  gomp_task_run_post_remove_parent (child_task);
	  gomp_clear_parent (child_task->children);
	  gomp_task_run_post_remove_taskgroup (child_task);
@@ -382,6 +706,12 @@ gomp_barrier_handle_tasks (gomp_barrier_
	  child_task = NULL;
	  if (!cancelled)
	    team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	  if (--team->task_count == 0
	      && gomp_team_barrier_waiting_for_tasks (&team->barrier))
	    {
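The do_wake logic added to the three waiters is the same idea everywhere:
if completing a task made more than one dependent task runnable
(new_tasks > 1), wake additional threads, but never more than the number
of new tasks and never more than the threads not currently accounted as
running a task (team->nthreads - team->task_running_count; in
GOMP_taskwait/GOMP_taskgroup_end further reduced by one when the waiting
thread itself isn't counted there, i.e. when in_tied_task is false).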
@@ -404,9 +734,10 @@ GOMP_taskwait (void)
   struct gomp_task *task = thr->task;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   /* The acquire barrier on load of task->children here synchronizes
-     with the write of a NULL in gomp_barrier_handle_tasks.  It is
+     with the write of a NULL in gomp_task_run_post_remove_parent.  It is
      not necessary that we synchronize with other non-NULL writes at
      this point, but we must ensure that all writes to memory by a
      child thread task work function are seen before we exit from
@@ -451,6 +782,11 @@ GOMP_taskwait (void)
	   in other threads.  Wait for them.  */
	task->in_taskwait = true;
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -464,15 +800,13 @@ GOMP_taskwait (void)
	  thr->task = task;
	}
       else
-	{
-	  gomp_sem_wait (&task->taskwait_sem);
-	  task->in_taskwait = false;
-	  return;
-	}
+	gomp_sem_wait (&task->taskwait_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  child_task->prev_child->next_child = child_task->next_child;
	  child_task->next_child->prev_child = child_task->prev_child;
	  if (task->children == child_task)
@@ -487,7 +821,13 @@ GOMP_taskwait (void)
	  to_free = child_task;
	  child_task = NULL;
	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	}
     }
 }
@@ -519,6 +859,7 @@ GOMP_taskgroup_start (void)
   taskgroup->children = NULL;
   taskgroup->in_taskgroup_wait = false;
   taskgroup->cancelled = false;
+  taskgroup->num_children = 0;
   gomp_sem_init (&taskgroup->taskgroup_sem, 0);
   task->taskgroup = taskgroup;
 }
@@ -532,18 +873,29 @@ GOMP_taskgroup_end (void)
   struct gomp_taskgroup *taskgroup;
   struct gomp_task *child_task = NULL;
   struct gomp_task *to_free = NULL;
+  int do_wake = 0;
 
   if (team == NULL)
     return;
   taskgroup = task->taskgroup;
-  if (__atomic_load_n (&taskgroup->children, MEMMODEL_ACQUIRE) == NULL)
+
+  /* The acquire barrier on load of taskgroup->num_children here
+     synchronizes with the write of 0 in gomp_task_run_post_remove_taskgroup.
+     It is not necessary that we synchronize with other non-0 writes at
+     this point, but we must ensure that all writes to memory by a
+     child thread task work function are seen before we exit from
+     GOMP_taskgroup_end.  */
+  if (__atomic_load_n (&taskgroup->num_children, MEMMODEL_ACQUIRE) == 0)
     goto finish;
+
   gomp_mutex_lock (&team->task_lock);
   while (1)
     {
       bool cancelled = false;
       if (taskgroup->children == NULL)
	{
+	  if (taskgroup->num_children)
+	    goto do_wait;
	  gomp_mutex_unlock (&team->task_lock);
	  if (to_free)
	    {
@@ -570,10 +922,18 @@ GOMP_taskgroup_end (void)
	    }
	}
       else
-	/* All tasks we are waiting for are already running
-	   in other threads.  Wait for them.  */
-	taskgroup->in_taskgroup_wait = true;
+	{
+	 do_wait:
+	  /* All tasks we are waiting for are already running
+	     in other threads.  Wait for them.  */
+	  taskgroup->in_taskgroup_wait = true;
+	}
       gomp_mutex_unlock (&team->task_lock);
+      if (do_wake)
+	{
+	  gomp_team_barrier_wake (&team->barrier, do_wake);
+	  do_wake = 0;
+	}
       if (to_free)
	{
	  gomp_finish_task (to_free);
@@ -587,19 +947,18 @@ GOMP_taskgroup_end (void)
	  thr->task = task;
	}
       else
-	{
-	  gomp_sem_wait (&taskgroup->taskgroup_sem);
-	  taskgroup->in_taskgroup_wait = false;
-	  goto finish;
-	}
+	gomp_sem_wait (&taskgroup->taskgroup_sem);
       gomp_mutex_lock (&team->task_lock);
       if (child_task)
	{
-	 finish_cancelled:
+	 finish_cancelled:;
+	  size_t new_tasks
+	    = gomp_task_run_post_handle_depend (child_task, team);
	  child_task->prev_taskgroup->next_taskgroup
	    = child_task->next_taskgroup;
	  child_task->next_taskgroup->prev_taskgroup
	    = child_task->prev_taskgroup;
+	  --taskgroup->num_children;
	  if (taskgroup->children == child_task)
	    {
	      if (child_task->next_taskgroup != child_task)
@@ -612,7 +971,13 @@ GOMP_taskgroup_end (void)
	  to_free = child_task;
	  child_task = NULL;
	  team->task_count--;
-	  team->task_running_count--;
+	  if (new_tasks > 1)
+	    {
+	      do_wake = team->nthreads - team->task_running_count
+			- !task->in_tied_task;
+	      if (do_wake > new_tasks)
+		do_wake = new_tasks;
+	    }
	}
     }
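As for the coverage of the four new tests below: depend-1.c checks the
basic flow, anti and output dependence plus concurrent-in cases in various
data-sharing setups, depend-2.c array section dependencies, depend-3.c
waiting through nested taskgroups and depend-4.c the interaction of depend
clauses with if (0) tasks.  They only need OpenMP 4.0 task depend support
in the compiler, so they should also build standalone with something like
gcc -fopenmp depend-1.c.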
--- libgomp/testsuite/libgomp.c/depend-4.c.jj	2013-09-27 11:42:37.283473918 +0200
+++ libgomp/testsuite/libgomp.c/depend-4.c	2013-09-27 11:49:37.781239095 +0200
@@ -0,0 +1,56 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1, y = 2, z = 3;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x, y, z) depend(inout: x, y) \
+			depend (in: z) if (x > 10)
+      {
+	if (x != 1 || y != 2 || z != 3)
+	  abort ();
+	x = 4;
+	y = 5;
+      }
+      /* The above task has depend clauses, but no dependencies
+	 on earlier tasks, and is if (0), so must be scheduled
+	 immediately.  */
+      if (x != 4 || y != 5)
+	abort ();
+    }
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x, y) depend(in: x, y)
+      {
+	usleep (10000);
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+      }
+      #pragma omp task shared (x, y) depend(in: x, y)
+      {
+	usleep (10000);
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+      }
+      #pragma omp task shared (x, y, z) depend(inout: x, y) \
+			depend (in: z) if (x > 10)
+      {
+	if (x != 4 || y != 5 || z != 3)
+	  abort ();
+	x = 6;
+	y = 7;
+      }
+      /* The above task has depend clauses, and may have dependencies
+	 on earlier tasks, so even though it is if (0), it can be
+	 deferred.  */
+    }
+    if (x != 6 || y != 7)
+      abort ();
+  }
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-1.c.jj	2013-09-26 17:57:26.011983435 +0200
+++ libgomp/testsuite/libgomp.c/depend-1.c	2013-09-26 21:09:57.128895308 +0200
@@ -0,0 +1,215 @@
+#include <stdlib.h>
+
+void
+dep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+dep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+void
+dep3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared (x) depend(out: x)
+      x = 2;
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+    }
+  }
+}
+
+void
+firstpriv (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task depend(out: x)
+    x = 2;
+    #pragma omp task depend(in: x)
+    if (x != 1)
+      abort ();
+  }
+}
+
+void
+antidep (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared(x) depend(in: x)
+    if (x != 1)
+      abort ();
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+  }
+}
+
+void
+antidep2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared(x) depend(in: x)
+      if (x != 1)
+	abort ();
+      #pragma omp task shared(x) depend(out: x)
+      x = 2;
+    }
+  }
+}
+
+void
+antidep3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared(x) depend(in: x)
+      if (x != 1)
+	abort ();
+      #pragma omp task shared(x) depend(out: x)
+      x = 2;
+    }
+  }
+}
+
+void
+outdep (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 0;
+    #pragma omp task shared(x) depend(out: x)
+    x = 1;
+    #pragma omp task shared(x) depend(out: x)
+    x = 2;
+    #pragma omp taskwait
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent (void)
+{
+  int x = 1;
+  #pragma omp parallel
+  #pragma omp single
+  {
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+  }
+}
+
+void
+concurrent2 (void)
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1;
+    #pragma omp task shared (x) depend(out: x)
+    x = 2;
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp task shared (x) depend(in: x)
+    if (x != 2)
+      abort ();
+    #pragma omp taskwait
+  }
+}
+
+void
+concurrent3 (void)
+{
+  #pragma omp parallel
+  {
+    int x = 1;
+    #pragma omp single
+    {
+      #pragma omp task shared (x) depend(out: x)
+      x = 2;
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+      #pragma omp task shared (x) depend(in: x)
+      if (x != 2)
+	abort ();
+    }
+  }
+}
+
+int
+main ()
+{
+  dep ();
+  dep2 ();
+  dep3 ();
+  firstpriv ();
+  antidep ();
+  antidep2 ();
+  antidep3 ();
+  outdep ();
+  concurrent ();
+  concurrent2 ();
+  concurrent3 ();
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-2.c.jj	2013-09-26 18:56:19.808294100 +0200
+++ libgomp/testsuite/libgomp.c/depend-2.c	2013-09-26 19:46:29.732123749 +0200
@@ -0,0 +1,71 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+void
+foo (int do_sleep)
+{
+  int a[64], i, *p = a + 4, x = 0;
+  asm volatile ("" : "+r" (p));
+  for (i = 0; i < 64; i++)
+    a[i] = i + 8;
+  #pragma omp parallel private (i)
+  {
+    #pragma omp single nowait
+    {
+      for (i = 0; i < 8; i++)
+	{
+	  #pragma omp task depend(out: a[i * 8 : 4])
+	  a[i * 8] += (i + 2) * 9;
+	  #pragma omp task depend(out: p[i * 8 : 2])
+	  p[i * 8] += (i + 3) * 10;
+	  #pragma omp task depend(out: x)
+	  x = 1;
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(in: a[i * 8 : 4]) \
+			 depend(inout: a[i * 8 + 4 : 2]) \
+			 depend(in: a[0 : 4]) depend(in: x)
+	{
+	  if (a[0] != 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != 4 + i * 8 + 8 + (i + 3) * 10)
+	    abort ();
+	  p[i * 8] += a[i * 8];
+	}
+      for (i = 0; i < 8; i++)
+	#pragma omp task depend(inout: a[i * 8 : 4]) \
+			 depend(in: p[i * 8 : 2]) \
+			 depend(in: p[0 : 2], x)
+	{
+	  if (p[0] != 4 + 8 + 3 * 10 + 0 + 8 + 2 * 9 || x != 1)
+	    abort ();
+	  if (a[i * 8] != i * 8 + 8 + (i + 2) * 9)
+	    abort ();
+	  if (a[4 + i * 8] != (4 + i * 8 + 8 + (i + 3) * 10
+			       + i * 8 + 8 + (i + 2) * 9))
+	    abort ();
+	  a[i * 8] += 2;
+	}
+      for (i = 0; i < 4; i++)
+	#pragma omp task depend(in: a[i * 16 : 4], a[i * 16 + 8 : 4], x)
+	{
+	  if (a[i * 16] != i * 16 + 8 + (2 * i + 2) * 9 + 2 || x != 1)
+	    abort ();
+	  if (p[i * 16 + 4] != i * 16 + 8 + 8 + (2 * i + 1 + 2) * 9 + 2)
+	    abort ();
+	}
+    }
+    if (do_sleep)
+      sleep (1);
+  }
+}
+
+int
+main ()
+{
+  foo (1);
+  foo (0);
+  return 0;
+}
--- libgomp/testsuite/libgomp.c/depend-3.c.jj	2013-09-27 11:32:44.410621977 +0200
+++ libgomp/testsuite/libgomp.c/depend-3.c	2013-09-27 11:39:25.500493830 +0200
@@ -0,0 +1,51 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+int
+main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+  {
+    int x = 1, y = 2;
+    #pragma omp taskgroup
+    {
+      #pragma omp task shared (x) depend(in: x)
+      {
+	usleep (10000);
+	if (x != 1)
+	  abort ();
+      }
+      #pragma omp taskgroup
+      {
+	#pragma omp task shared (x) depend(in: x)
+	{
+	  usleep (15000);
+	  if (x != 1)
+	    abort ();
+	}
+	#pragma omp task shared (y) depend(inout: y)
+	{
+	  if (y != 2)
+	    abort ();
+	  y = 3;
+	}
+	#pragma omp taskgroup
+	{
+	  #pragma omp task shared (x) depend(in: x)
+	  {
+	    usleep (13000);
+	    if (x != 1)
+	      abort ();
+	  }
+	  #pragma omp taskgroup
+	  {
+	    #pragma omp task shared (x) depend(out: x)
+	    x = 2;
+	  }
+	}
+      }
+    }
+  }
+  return 0;
+}

	Jakub