Try to recycle the last non-nested team to avoid malloc() and free() in the common case where consecutive parallel regions use the same number of threads.  This also avoids superfluous destruction and re-initialization of the team synchronization objects.
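The workload being optimized is a series of non-nested parallel regions that all request the same number of threads.  A minimal sketch of such a workload (illustrative only, not the benchmark from the post linked below; the iteration count and the empty region body are arbitrary):

#include <omp.h>
#include <stdio.h>

int
main (void)
{
  double start = omp_get_wtime ();

  /* Every iteration opens and closes a parallel region with the same
     (default) number of threads, so each gomp_new_team() call can hand
     back the team released by the previous iteration instead of
     calling malloc().  */
  for (int i = 0; i < 200000; i++)
    {
#pragma omp parallel
      (void) omp_get_thread_num ();
    }

  printf ("elapsed %f seconds\n", omp_get_wtime () - start);
  return 0;
}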
Using the microbenchmark posted here

  https://gcc.gnu.org/ml/gcc-patches/2008-03/msg00930.html

shows an improvement in the parallel bench test case (target
x86_64-unknown-linux-gnu, median out of 9 test runs, iteration count
increased to 200000).

Before the patch:

  parallel bench 11.2284 seconds

After the patch:

  parallel bench 10.5912 seconds

libgomp/ChangeLog
2015-07-13  Sebastian Huber  <sebastian.hu...@embedded-brains.de>

	* team.c (get_recycable_team): New.
	(gomp_new_team): Recycle last non-nested team if possible.
	(free_team): Destroy more team synchronization objects.
	(gomp_team_end): Move some team synchronization object
	destructions to free_team().
---
 libgomp/team.c | 54 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/libgomp/team.c b/libgomp/team.c
index b98b233..0bcbaf8 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -134,6 +134,25 @@ gomp_thread_start (void *xdata)
   return NULL;
 }
 
+static struct gomp_team *
+get_recycable_team (unsigned nthreads)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  if (thr->ts.team == NULL)
+    {
+      struct gomp_thread_pool *pool = thr->thread_pool;
+      if (pool != NULL)
+        {
+          struct gomp_team *last_team = pool->last_team;
+          if (last_team != NULL && last_team->nthreads == nthreads)
+            {
+              pool->last_team = NULL;
+              return last_team;
+            }
+        }
+    }
+  return NULL;
+}
 
 /* Create a new team data structure.  */
 
@@ -141,18 +160,28 @@ struct gomp_team *
 gomp_new_team (unsigned nthreads)
 {
   struct gomp_team *team;
-  size_t size;
   int i;
 
-  size = sizeof (*team) + nthreads * (sizeof (team->ordered_release[0])
-                                      + sizeof (team->implicit_task[0]));
-  team = gomp_malloc (size);
+  team = get_recycable_team (nthreads);
+  if (team == NULL)
+    {
+      size_t extra = sizeof (team->ordered_release[0])
+                     + sizeof (team->implicit_task[0]);
+      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+
+#ifndef HAVE_SYNC_BUILTINS
+      gomp_mutex_init (&team->work_share_list_free_lock);
+#endif
+      gomp_barrier_init (&team->barrier, nthreads);
+      gomp_sem_init (&team->master_release, 0);
+      gomp_mutex_init (&team->task_lock);
+
+      team->nthreads = nthreads;
+    }
 
   team->work_share_chunk = 8;
 #ifdef HAVE_SYNC_BUILTINS
   team->single_count = 0;
-#else
-  gomp_mutex_init (&team->work_share_list_free_lock);
 #endif
   team->work_shares_to_free = &team->work_shares[0];
   gomp_init_work_share (&team->work_shares[0], false, nthreads);
@@ -163,14 +192,9 @@ gomp_new_team (unsigned nthreads)
     team->work_shares[i].next_free = &team->work_shares[i + 1];
   team->work_shares[i].next_free = NULL;
 
-  team->nthreads = nthreads;
-  gomp_barrier_init (&team->barrier, nthreads);
-
-  gomp_sem_init (&team->master_release, 0);
   team->ordered_release = (void *) &team->implicit_task[nthreads];
   team->ordered_release[0] = &team->master_release;
 
-  gomp_mutex_init (&team->task_lock);
   team->task_queue = NULL;
   team->task_count = 0;
   team->task_queued_count = 0;
@@ -187,6 +211,10 @@ gomp_new_team (unsigned nthreads)
 static void
 free_team (struct gomp_team *team)
 {
+  gomp_sem_destroy (&team->master_release);
+#ifndef HAVE_SYNC_BUILTINS
+  gomp_mutex_destroy (&team->work_share_list_free_lock);
+#endif
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   free (team);
@@ -894,10 +922,6 @@ gomp_team_end (void)
         }
       while (ws != NULL);
     }
-  gomp_sem_destroy (&team->master_release);
-#ifndef HAVE_SYNC_BUILTINS
-  gomp_mutex_destroy (&team->work_share_list_free_lock);
-#endif
 
   if (__builtin_expect (thr->ts.team != NULL, 0)
       || __builtin_expect (team->nthreads == 1, 0))
-- 
1.8.4.5
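
For reviewers who prefer the idea without the diff context: stripped of the locking details and the non-nested check (thr->ts.team == NULL in the patch), the recycling amounts to a one-slot cache keyed on the thread count.  A simplified stand-alone model follows; the names are invented for illustration and do not match libgomp:

#include <stdlib.h>

struct team
{
  unsigned nthreads;
  /* Synchronization objects live here; while a team sits in the
     one-slot cache they stay initialized, which is what makes reuse
     cheaper than destroy + malloc + init.  */
};

/* One-slot cache, corresponding to pool->last_team in libgomp.  */
static struct team *last_team;

static struct team *
team_new (unsigned nthreads)
{
  struct team *team = last_team;
  if (team != NULL && team->nthreads == nthreads)
    {
      /* Fast path: reuse the cached team, with no allocation and no
         re-initialization of its synchronization objects.  */
      last_team = NULL;
      return team;
    }
  team = malloc (sizeof (*team));
  team->nthreads = nthreads;
  /* Initialize the synchronization objects here.  */
  return team;
}

static void
team_free (struct team *team)
{
  /* Only here are the synchronization objects destroyed, mirroring how
     the patch moves the destruction from gomp_team_end() into
     free_team().  */
  free (team);
}

static void
team_end (struct team *team)
{
  if (last_team == NULL)
    last_team = team;   /* keep it for the next parallel region */
  else
    team_free (team);
}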