Hi! The following patch implements what I've been talking about earlier, honor that for explicit num_teams clause we create at least the lower-bound (if not specified, upper-bound) teams in the league. For host fallback, it still means we only have one thread doing all the teams, sequentially one after another. For PTX and GCN, I think the new teams-2.c test and maybe teams-4.c too will or might fail. For these offloads, I think it is ok to remove symbols no longer used from libgomp.a. If num_teams_lower is bigger than the provided num_blocks or num_workgroups, we should arrange for gomp_num_teams_var to be num_teams_lower - 1, stop using the %ctaid.x or __builtin_gcn_dim_pos (0) for omp_get_team_num () and instead use for it some .shared var that GOMP_teams4 initializes to %ctaid.x or __builtin_gcn_dim_pos (0) when first and for !first increment that by num_blocks or num_workgroups each time and only return false when we are above num_teams_lower. Any help with actually implementing this for the 2 architectures highly appreciated.
I have only lightly tested the patch so far, will bootstrap/regtest it overnight without offloading. 2021-11-11 Jakub Jelinek <ja...@redhat.com> gcc/ * omp-builtins.def (BUILT_IN_GOMP_TEAMS): Remove. (BUILT_IN_GOMP_TEAMS4): New. * builtin-types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. * omp-low.c (lower_omp_teams): Use GOMP_teams4 instead of GOMP_teams, pass to it also num_teams lower-bound expression or a dup of upper-bound if it is missing and a flag whether it is the first call or not. gcc/fortran/ * types.def (BT_FN_VOID_UINT_UINT): Remove. (BT_FN_BOOL_UINT_UINT_UINT_BOOL): New. libgomp/ * libgomp_g.h (GOMP_teams4): Declare. * libgomp.map (GOMP_5.1): Export GOMP_teams4. * target.c (GOMP_teams4): New function. * config/nvptx/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * config/gcn/target.c (GOMP_teams): Remove. (GOMP_teams4): New function. * testsuite/libgomp.c/teams-4.c (main): Expect exactly 2 teams instead of <= 2. * testsuite/libgomp.c-c++-common/teams-2.c: New test. --- gcc/omp-builtins.def.jj 2021-08-20 11:36:30.961244658 +0200 +++ gcc/omp-builtins.def 2021-11-11 17:53:44.092433139 +0100 @@ -442,8 +442,8 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_U DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_ENTER_EXIT_DATA, "GOMP_target_enter_exit_data", BT_FN_VOID_INT_SIZE_PTR_PTR_PTR_UINT_PTR, ATTR_NOTHROW_LIST) -DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS, "GOMP_teams", - BT_FN_VOID_UINT_UINT, ATTR_NOTHROW_LIST) +DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS4, "GOMP_teams4", + BT_FN_BOOL_UINT_UINT_UINT_BOOL, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS_REG, "GOMP_teams_reg", BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TASKGROUP_REDUCTION_REGISTER, --- gcc/builtin-types.def.jj 2021-02-04 18:15:05.253113955 +0100 +++ gcc/builtin-types.def 2021-11-11 17:54:12.693023370 +0100 @@ -489,7 +489,6 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_VPTR_INT DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE, BT_CONST_VOLATILE_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL) -DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT) DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE) @@ -680,6 +679,8 @@ DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULL BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_4 (BT_FN_VOID_UINT_PTR_INT_PTR, BT_VOID, BT_INT, BT_PTR, BT_INT, BT_PTR) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_UINT_UINT_BOOL, + BT_BOOL, BT_UINT, BT_UINT, BT_UINT, BT_BOOL) DEF_FUNCTION_TYPE_5 (BT_FN_INT_STRING_INT_SIZE_CONST_STRING_VALIST_ARG, BT_INT, BT_STRING, BT_INT, BT_SIZE, BT_CONST_STRING, --- gcc/omp-low.c.jj 2021-11-11 14:35:37.633348092 +0100 +++ gcc/omp-low.c 2021-11-11 19:19:43.940085275 +0100 @@ -13902,14 +13902,24 @@ lower_omp_teams (gimple_stmt_iterator *g tree num_teams = omp_find_clause (gimple_omp_teams_clauses (teams_stmt), OMP_CLAUSE_NUM_TEAMS); + tree num_teams_lower = NULL_TREE; if (num_teams == NULL_TREE) num_teams = build_int_cst (unsigned_type_node, 0); else { + num_teams_lower = OMP_CLAUSE_NUM_TEAMS_LOWER_EXPR (num_teams); + if (num_teams_lower) + { + num_teams_lower = fold_convert (unsigned_type_node, num_teams_lower); + gimplify_expr (&num_teams_lower, &bind_body, NULL, is_gimple_val, + fb_rvalue); + } num_teams = OMP_CLAUSE_NUM_TEAMS_UPPER_EXPR (num_teams); num_teams = fold_convert (unsigned_type_node, num_teams); gimplify_expr (&num_teams, &bind_body, NULL, is_gimple_val, fb_rvalue); } + if (num_teams_lower == NULL_TREE) + num_teams_lower = num_teams; tree thread_limit = omp_find_clause (gimple_omp_teams_clauses (teams_stmt), OMP_CLAUSE_THREAD_LIMIT); if (thread_limit == NULL_TREE) @@ -13921,6 +13931,29 @@ lower_omp_teams (gimple_stmt_iterator *g gimplify_expr (&thread_limit, &bind_body, NULL, is_gimple_val, fb_rvalue); } + location_t loc = gimple_location (teams_stmt); + tree first = create_tmp_var (boolean_type_node); + gimple_seq_add_stmt (&bind_body, + gimple_build_assign (first, boolean_true_node)); + tree llabel = create_artificial_label (loc); + gimple_seq_add_stmt (&bind_body, gimple_build_label (llabel)); + tree decl = builtin_decl_explicit (BUILT_IN_GOMP_TEAMS4); + gimple *call + = gimple_build_call (decl, 4, num_teams_lower, num_teams, thread_limit, + first); + gimple_set_location (call, loc); + tree temp = create_tmp_var (boolean_type_node); + gimple_call_set_lhs (call, temp); + gimple_seq_add_stmt (&bind_body, call); + + tree tlabel = create_artificial_label (loc); + tree flabel = create_artificial_label (loc); + gimple *cond = gimple_build_cond (NE_EXPR, temp, boolean_false_node, + tlabel, flabel); + gimple_seq_add_stmt (&bind_body, cond); + gimple_seq_add_stmt (&bind_body, gimple_build_label (tlabel)); + gimple_seq_add_stmt (&bind_body, + gimple_build_assign (first, boolean_false_node)); lower_rec_input_clauses (gimple_omp_teams_clauses (teams_stmt), &bind_body, &dlist, ctx, NULL); @@ -13929,17 +13962,13 @@ lower_omp_teams (gimple_stmt_iterator *g NULL, ctx); gimple_seq_add_stmt (&bind_body, teams_stmt); - location_t loc = gimple_location (teams_stmt); - tree decl = builtin_decl_explicit (BUILT_IN_GOMP_TEAMS); - gimple *call = gimple_build_call (decl, 2, num_teams, thread_limit); - gimple_set_location (call, loc); - gimple_seq_add_stmt (&bind_body, call); - gimple_seq_add_seq (&bind_body, gimple_omp_body (teams_stmt)); gimple_omp_set_body (teams_stmt, NULL); gimple_seq_add_seq (&bind_body, olist); gimple_seq_add_seq (&bind_body, dlist); gimple_seq_add_stmt (&bind_body, gimple_build_omp_return (true)); + gimple_seq_add_stmt (&bind_body, gimple_build_goto (llabel)); + gimple_seq_add_stmt (&bind_body, gimple_build_label (flabel)); gimple_bind_set_body (bind, bind_body); pop_gimplify_context (bind); --- gcc/fortran/types.def.jj 2021-08-20 11:36:30.969244547 +0200 +++ gcc/fortran/types.def 2021-11-11 17:54:35.838691751 +0100 @@ -117,7 +117,6 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_VPTR_INT DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE, BT_CONST_VOLATILE_PTR) DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL) -DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTR_PTRMODE, BT_VOID, BT_PTR, BT_PTRMODE) DEF_FUNCTION_TYPE_2 (BT_FN_VOID_CONST_PTR_SIZE, BT_VOID, BT_CONST_PTR, BT_SIZE) @@ -173,6 +172,8 @@ DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_ULL BT_PTR_ULONGLONG) DEF_FUNCTION_TYPE_4 (BT_FN_VOID_UINT_PTR_INT_PTR, BT_VOID, BT_INT, BT_PTR, BT_INT, BT_PTR) +DEF_FUNCTION_TYPE_4 (BT_FN_BOOL_UINT_UINT_UINT_BOOL, + BT_BOOL, BT_UINT, BT_UINT, BT_UINT, BT_BOOL) DEF_FUNCTION_TYPE_5 (BT_FN_VOID_OMPFN_PTR_UINT_UINT_UINT, BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, BT_UINT, BT_UINT, --- libgomp/libgomp_g.h.jj 2021-08-20 11:36:30.970244532 +0200 +++ libgomp/libgomp_g.h 2021-11-11 17:52:35.342418147 +0100 @@ -355,6 +355,7 @@ extern void GOMP_target_enter_exit_data unsigned short *, unsigned int, void **); extern void GOMP_teams (unsigned int, unsigned int); +extern bool GOMP_teams4 (unsigned int, unsigned int, unsigned int, bool); /* teams.c */ --- libgomp/libgomp.map.jj 2021-10-11 12:20:21.926063118 +0200 +++ libgomp/libgomp.map 2021-11-11 19:45:27.472110004 +0100 @@ -399,6 +399,7 @@ GOMP_5.1 { GOMP_error; GOMP_scope_start; GOMP_warning; + GOMP_teams4; } GOMP_5.0.1; OACC_2.0 { --- libgomp/target.c.jj 2021-10-13 09:55:49.777781910 +0200 +++ libgomp/target.c 2021-11-11 18:00:17.465797133 +0100 @@ -3088,6 +3088,32 @@ GOMP_teams (unsigned int num_teams, unsi (void) num_teams; } +bool +GOMP_teams4 (unsigned int num_teams_low, unsigned int num_teams_high, + unsigned int thread_limit, bool first) +{ + struct gomp_thread *thr = gomp_thread (); + if (first) + { + if (thread_limit) + { + struct gomp_task_icv *icv = gomp_icv (true); + icv->thread_limit_var + = thread_limit > INT_MAX ? UINT_MAX : thread_limit; + } + (void) num_teams_high; + if (num_teams_low == 0) + num_teams_low = 1; + thr->num_teams = num_teams_low - 1; + thr->team_num = 0; + } + else if (thr->team_num == thr->num_teams) + return false; + else + ++thr->team_num; + return true; +} + void * omp_target_alloc (size_t size, int device_num) { --- libgomp/config/nvptx/target.c.jj 2021-05-26 11:28:42.064386868 +0200 +++ libgomp/config/nvptx/target.c 2021-11-11 19:16:46.368611557 +0100 @@ -26,9 +26,12 @@ #include "libgomp.h" #include <limits.h> -void -GOMP_teams (unsigned int num_teams, unsigned int thread_limit) +bool +GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper, + unsigned int thread_limit, bool first) { + if (!first) + return false; if (thread_limit) { struct gomp_task_icv *icv = gomp_icv (true); @@ -38,14 +41,15 @@ GOMP_teams (unsigned int num_teams, unsi unsigned int num_blocks, block_id; asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks)); asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id)); - if (!num_teams || num_teams >= num_blocks) - num_teams = num_blocks; - else if (block_id >= num_teams) - { - gomp_free_thread (nvptx_thrs); - asm ("exit;"); - } - gomp_num_teams_var = num_teams - 1; + /* FIXME: If num_teams_lower > num_blocks, we want to loop multiple + times for some CTAs. */ + (void) num_teams_lower; + if (!num_teams_upper || num_teams_upper >= num_blocks) + num_teams_upper = num_blocks; + else if (block_id >= num_teams_upper) + return false; + gomp_num_teams_var = num_teams_upper - 1; + return true; } int --- libgomp/config/gcn/target.c.jj 2021-05-26 11:28:42.064386868 +0200 +++ libgomp/config/gcn/target.c 2021-11-11 19:18:35.873053653 +0100 @@ -26,9 +26,12 @@ #include "libgomp.h" #include <limits.h> -void -GOMP_teams (unsigned int num_teams, unsigned int thread_limit) +bool +GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper, + unsigned int thread_limit, bool first) { + if (!first) + return false; if (thread_limit) { struct gomp_task_icv *icv = gomp_icv (true); @@ -38,14 +41,15 @@ GOMP_teams (unsigned int num_teams, unsi unsigned int num_workgroups, workgroup_id; num_workgroups = __builtin_gcn_dim_size (0); workgroup_id = __builtin_gcn_dim_pos (0); - if (!num_teams || num_teams >= num_workgroups) - num_teams = num_workgroups; - else if (workgroup_id >= num_teams) - { - gomp_free_thread (gcn_thrs ()); - exit (0); - } - gomp_num_teams_var = num_teams - 1; + /* FIXME: If num_teams_lower > num_workgroups, we want to loop + multiple times at least for some workgroups. */ + (void) num_teams_lower; + if (!num_teams_upper || num_teams_upper >= num_workgroups) + num_teams_upper = num_workgroups; + else if (workgroup_id >= num_teams_upper) + return false; + gomp_num_teams_var = num_teams_upper - 1; + return true; } int --- libgomp/testsuite/libgomp.c/teams-4.c.jj 2021-11-11 14:35:37.700347127 +0100 +++ libgomp/testsuite/libgomp.c/teams-4.c 2021-11-11 19:27:07.198779107 +0100 @@ -20,7 +20,7 @@ main () #pragma omp parallel if (0) #pragma omp target #pragma omp teams num_teams (2) - if (omp_get_num_teams () > 2 + if (omp_get_num_teams () != 2 || (unsigned) omp_get_team_num () >= 2U) abort (); if (omp_get_num_teams () != 4 || (unsigned) team >= 4U) --- libgomp/testsuite/libgomp.c-c++-common/teams-2.c.jj 2021-11-11 19:28:33.066557475 +0100 +++ libgomp/testsuite/libgomp.c-c++-common/teams-2.c 2021-11-11 19:44:03.837301688 +0100 @@ -0,0 +1,70 @@ +#include <omp.h> +#include <stdlib.h> + +int +foo () +{ + return 934; +} + +int +main () +{ + int a[934] = {}; + int k, e; + #pragma omp target map(a) + #pragma omp teams num_teams (foo ()) + { + int i = omp_get_team_num (); + if (omp_get_num_teams () != 934 + || (unsigned) i >= 934U + || a[i] != 0) + abort (); + ++a[i]; + } + #pragma omp target map(a) + #pragma omp teams num_teams (foo () - 50 : foo ()) + { + int i = omp_get_team_num (); + int j = omp_get_num_teams (); + if (j < 884 + || j > 934 + || (unsigned) i >= (unsigned) j + || a[i] != 1) + abort (); + ++a[i]; + } + #pragma omp target teams map(a) num_teams (foo () / 2) + { + int i = omp_get_team_num (); + if (omp_get_num_teams () != 467 + || (unsigned) i >= 467U + || a[i] != 2) + abort (); + ++a[i]; + } + #pragma omp target teams map(a) num_teams (foo () / 2 - 50 : foo () / 2) + { + int i = omp_get_team_num (); + int j = omp_get_num_teams (); + if (j < 417 + || j > 467 + || (unsigned) i >= (unsigned) j + || a[i] != 3) + abort (); + ++a[i]; + } + e = 4; + for (k = 0; k < 934; k++) + { + if (k >= 417 && k < 467 && a[k] == 3) + e = 3; + else if (k == 467) + e = 2; + else if (k >= 884 && a[k] == 1) + e = 1; + if (a[k] != e) + abort (); + } + return 0; +} Jakub