Hi! With the strict: modifier on these clauses, the standard is explicit about how many iterations (and which) each generated task of a taskloop directive should contain. For num_tasks it actually matches what we were already implementing, but for grainsize it does not (and even violates the old rule - without strict it requires that the number of iterations (unspecified which exactly) handled by each generated task is >= grainsize argument and < 2 * grainsize argument, with strict: it requires that each generated task handles exactly == grainsize argument iterations, except for the generated task handling the last iteration, which can handle <= grainsize iterations).
The following patch implements it for C and C++. Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk. 2021-08-23 Jakub Jelinek <ja...@redhat.com> gcc/ * tree.h (OMP_CLAUSE_GRAINSIZE_STRICT): Define. (OMP_CLAUSE_NUM_TASKS_STRICT): Define. * tree-pretty-print.c (dump_omp_clause) <case OMP_CLAUSE_GRAINSIZE, case OMP_CLAUSE_NUM_TASKS>: Print strict: modifier. * omp-expand.c (expand_task_call): Use GOMP_TASK_FLAG_STRICT in iflags if either grainsize or num_tasks clause has the strict modifier. gcc/c/ * c-parser.c (c_parser_omp_clause_num_tasks, c_parser_omp_clause_grainsize): Parse the optional strict: modifier. gcc/cp/ * parser.c (cp_parser_omp_clause_num_tasks, cp_parser_omp_clause_grainsize): Parse the optional strict: modifier. include/ * gomp-constants.h (GOMP_TASK_FLAG_STRICT): Define. libgomp/ * taskloop.c (GOMP_taskloop): Handle GOMP_TASK_FLAG_STRICT. * testsuite/libgomp.c-c++-common/taskloop-4.c (main): Fix up comment. * testsuite/libgomp.c-c++-common/taskloop-5.c: New test. 
--- gcc/tree.h.jj 2021-08-19 11:42:27.458421107 +0200 +++ gcc/tree.h 2021-08-20 18:22:28.743682537 +0200 @@ -1612,6 +1612,11 @@ class auto_suppress_location_wrappers #define OMP_CLAUSE_PRIORITY_EXPR(NODE) \ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_PRIORITY),0) +#define OMP_CLAUSE_GRAINSIZE_STRICT(NODE) \ + TREE_PRIVATE (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_GRAINSIZE)) +#define OMP_CLAUSE_NUM_TASKS_STRICT(NODE) \ + TREE_PRIVATE (OMP_CLAUSE_SUBCODE_CHECK (NODE, OMP_CLAUSE_NUM_TASKS)) + /* OpenACC clause expressions */ #define OMP_CLAUSE_EXPR(NODE, CLAUSE) \ OMP_CLAUSE_OPERAND (OMP_CLAUSE_SUBCODE_CHECK (NODE, CLAUSE), 0) --- gcc/tree-pretty-print.c.jj 2021-08-17 09:29:41.391205129 +0200 +++ gcc/tree-pretty-print.c 2021-08-20 18:23:32.522804918 +0200 @@ -1066,6 +1066,8 @@ dump_omp_clause (pretty_printer *pp, tre case OMP_CLAUSE_GRAINSIZE: pp_string (pp, "grainsize("); + if (OMP_CLAUSE_GRAINSIZE_STRICT (clause)) + pp_string (pp, "strict:"); dump_generic_node (pp, OMP_CLAUSE_GRAINSIZE_EXPR (clause), spc, flags, false); pp_right_paren (pp); @@ -1073,6 +1075,8 @@ dump_omp_clause (pretty_printer *pp, tre case OMP_CLAUSE_NUM_TASKS: pp_string (pp, "num_tasks("); + if (OMP_CLAUSE_NUM_TASKS_STRICT (clause)) + pp_string (pp, "strict:"); dump_generic_node (pp, OMP_CLAUSE_NUM_TASKS_EXPR (clause), spc, flags, false); pp_right_paren (pp); --- gcc/omp-expand.c.jj 2021-08-17 09:29:41.398205034 +0200 +++ gcc/omp-expand.c 2021-08-20 18:49:35.779449914 +0200 @@ -791,13 +791,19 @@ expand_task_call (struct omp_region *reg tree tclauses = gimple_omp_for_clauses (g); num_tasks = omp_find_clause (tclauses, OMP_CLAUSE_NUM_TASKS); if (num_tasks) - num_tasks = OMP_CLAUSE_NUM_TASKS_EXPR (num_tasks); + { + if (OMP_CLAUSE_NUM_TASKS_STRICT (num_tasks)) + iflags |= GOMP_TASK_FLAG_STRICT; + num_tasks = OMP_CLAUSE_NUM_TASKS_EXPR (num_tasks); + } else { num_tasks = omp_find_clause (tclauses, OMP_CLAUSE_GRAINSIZE); if (num_tasks) { iflags |= GOMP_TASK_FLAG_GRAINSIZE; + if 
(OMP_CLAUSE_GRAINSIZE_STRICT (num_tasks)) + iflags |= GOMP_TASK_FLAG_STRICT; num_tasks = OMP_CLAUSE_GRAINSIZE_EXPR (num_tasks); } else --- gcc/c/c-parser.c.jj 2021-08-20 11:36:30.964244616 +0200 +++ gcc/c/c-parser.c 2021-08-20 18:33:52.145278707 +0200 @@ -13786,7 +13786,10 @@ c_parser_omp_clause_num_threads (c_parse } /* OpenMP 4.5: - num_tasks ( expression ) */ + num_tasks ( expression ) + + OpenMP 5.1: + num_tasks ( strict : expression ) */ static tree c_parser_omp_clause_num_tasks (c_parser *parser, tree list) @@ -13795,6 +13798,17 @@ c_parser_omp_clause_num_tasks (c_parser matching_parens parens; if (parens.require_open (parser)) { + bool strict = false; + if (c_parser_next_token_is (parser, CPP_NAME) + && c_parser_peek_2nd_token (parser)->type == CPP_COLON + && strcmp (IDENTIFIER_POINTER (c_parser_peek_token (parser)->value), + "strict") == 0) + { + strict = true; + c_parser_consume_token (parser); + c_parser_consume_token (parser); + } + location_t expr_loc = c_parser_peek_token (parser)->location; c_expr expr = c_parser_expr_no_commas (parser, NULL); expr = convert_lvalue_to_rvalue (expr_loc, expr, false, true); @@ -13824,6 +13838,7 @@ c_parser_omp_clause_num_tasks (c_parser c = build_omp_clause (num_tasks_loc, OMP_CLAUSE_NUM_TASKS); OMP_CLAUSE_NUM_TASKS_EXPR (c) = t; + OMP_CLAUSE_NUM_TASKS_STRICT (c) = strict; OMP_CLAUSE_CHAIN (c) = list; list = c; } @@ -13832,7 +13847,10 @@ c_parser_omp_clause_num_tasks (c_parser } /* OpenMP 4.5: - grainsize ( expression ) */ + grainsize ( expression ) + + OpenMP 5.1: + grainsize ( strict : expression ) */ static tree c_parser_omp_clause_grainsize (c_parser *parser, tree list) @@ -13841,6 +13859,17 @@ c_parser_omp_clause_grainsize (c_parser matching_parens parens; if (parens.require_open (parser)) { + bool strict = false; + if (c_parser_next_token_is (parser, CPP_NAME) + && c_parser_peek_2nd_token (parser)->type == CPP_COLON + && strcmp (IDENTIFIER_POINTER (c_parser_peek_token (parser)->value), + "strict") == 0) + { + 
strict = true; + c_parser_consume_token (parser); + c_parser_consume_token (parser); + } + location_t expr_loc = c_parser_peek_token (parser)->location; c_expr expr = c_parser_expr_no_commas (parser, NULL); expr = convert_lvalue_to_rvalue (expr_loc, expr, false, true); @@ -13870,6 +13899,7 @@ c_parser_omp_clause_grainsize (c_parser c = build_omp_clause (grainsize_loc, OMP_CLAUSE_GRAINSIZE); OMP_CLAUSE_GRAINSIZE_EXPR (c) = t; + OMP_CLAUSE_GRAINSIZE_STRICT (c) = strict; OMP_CLAUSE_CHAIN (c) = list; list = c; } --- gcc/cp/parser.c.jj 2021-08-20 11:36:30.968244560 +0200 +++ gcc/cp/parser.c 2021-08-20 18:46:20.945085317 +0200 @@ -37237,7 +37237,10 @@ cp_parser_omp_clause_num_threads (cp_par } /* OpenMP 4.5: - num_tasks ( expression ) */ + num_tasks ( expression ) + + OpenMP 5.1: + num_tasks ( strict : expression ) */ static tree cp_parser_omp_clause_num_tasks (cp_parser *parser, tree list, @@ -37249,6 +37252,19 @@ cp_parser_omp_clause_num_tasks (cp_parse if (!parens.require_open (parser)) return list; + bool strict = false; + if (cp_lexer_next_token_is (parser->lexer, CPP_NAME) + && cp_lexer_nth_token_is (parser->lexer, 2, CPP_COLON)) + { + tree id = cp_lexer_peek_token (parser->lexer)->u.value; + if (!strcmp (IDENTIFIER_POINTER (id), "strict")) + { + strict = true; + cp_lexer_consume_token (parser->lexer); + cp_lexer_consume_token (parser->lexer); + } + } + t = cp_parser_assignment_expression (parser); if (t == error_mark_node @@ -37262,13 +37278,17 @@ cp_parser_omp_clause_num_tasks (cp_parse c = build_omp_clause (location, OMP_CLAUSE_NUM_TASKS); OMP_CLAUSE_NUM_TASKS_EXPR (c) = t; + OMP_CLAUSE_NUM_TASKS_STRICT (c) = strict; OMP_CLAUSE_CHAIN (c) = list; return c; } /* OpenMP 4.5: - grainsize ( expression ) */ + grainsize ( expression ) + + OpenMP 5.1: + grainsize ( strict : expression ) */ static tree cp_parser_omp_clause_grainsize (cp_parser *parser, tree list, @@ -37280,6 +37300,19 @@ cp_parser_omp_clause_grainsize (cp_parse if (!parens.require_open (parser)) return 
list; + bool strict = false; + if (cp_lexer_next_token_is (parser->lexer, CPP_NAME) + && cp_lexer_nth_token_is (parser->lexer, 2, CPP_COLON)) + { + tree id = cp_lexer_peek_token (parser->lexer)->u.value; + if (!strcmp (IDENTIFIER_POINTER (id), "strict")) + { + strict = true; + cp_lexer_consume_token (parser->lexer); + cp_lexer_consume_token (parser->lexer); + } + } + t = cp_parser_assignment_expression (parser); if (t == error_mark_node @@ -37293,6 +37326,7 @@ cp_parser_omp_clause_grainsize (cp_parse c = build_omp_clause (location, OMP_CLAUSE_GRAINSIZE); OMP_CLAUSE_GRAINSIZE_EXPR (c) = t; + OMP_CLAUSE_GRAINSIZE_STRICT (c) = strict; OMP_CLAUSE_CHAIN (c) = list; return c; --- include/gomp-constants.h.jj 2021-01-16 22:52:33.673413185 +0100 +++ include/gomp-constants.h 2021-08-20 18:17:39.316666260 +0200 @@ -222,6 +222,7 @@ enum gomp_map_kind #define GOMP_TASK_FLAG_NOGROUP (1 << 11) #define GOMP_TASK_FLAG_REDUCTION (1 << 12) #define GOMP_TASK_FLAG_DETACH (1 << 13) +#define GOMP_TASK_FLAG_STRICT (1 << 14) /* GOMP_target{_ext,update_ext,enter_exit_data} flags argument. 
*/ #define GOMP_TARGET_FLAG_NOWAIT (1 << 0) --- libgomp/taskloop.c.jj 2021-05-11 23:40:52.744338169 +0200 +++ libgomp/taskloop.c 2021-08-22 14:37:56.859984138 +0200 @@ -97,6 +97,7 @@ GOMP_taskloop (void (*fn) (void *), void #endif TYPE task_step = step; + TYPE nfirst_task_step = step; unsigned long nfirst = n; if (flags & GOMP_TASK_FLAG_GRAINSIZE) { @@ -109,7 +110,22 @@ GOMP_taskloop (void (*fn) (void *), void if (num_tasks != ndiv) num_tasks = ~0UL; #endif - if (num_tasks <= 1) + if ((flags & GOMP_TASK_FLAG_STRICT) + && num_tasks != ~0ULL) + { + UTYPE mod = n % grainsize; + task_step = (TYPE) grainsize * step; + if (mod) + { + num_tasks++; + nfirst_task_step = (TYPE) mod * step; + if (num_tasks == 1) + task_step = nfirst_task_step; + else + nfirst = num_tasks - 2; + } + } + else if (num_tasks <= 1) { num_tasks = 1; task_step = end - start; @@ -124,6 +140,7 @@ GOMP_taskloop (void (*fn) (void *), void task_step = (TYPE) grainsize * step; if (mul != n) { + nfirst_task_step = task_step; task_step += step; nfirst = n - mul - 1; } @@ -135,6 +152,7 @@ GOMP_taskloop (void (*fn) (void *), void task_step = (TYPE) div * step; if (mod) { + nfirst_task_step = task_step; task_step += step; nfirst = mod - 1; } @@ -153,6 +171,7 @@ GOMP_taskloop (void (*fn) (void *), void task_step = (TYPE) div * step; if (mod) { + nfirst_task_step = task_step; task_step += step; nfirst = mod - 1; } @@ -225,7 +244,7 @@ GOMP_taskloop (void (*fn) (void *), void start += task_step; ((TYPE *)arg)[1] = start; if (i == nfirst) - task_step -= step; + task_step = nfirst_task_step; fn (arg); arg += arg_size; if (!priority_queue_empty_p (&task[i].children_queue, @@ -258,7 +277,7 @@ GOMP_taskloop (void (*fn) (void *), void start += task_step; ((TYPE *)data)[1] = start; if (i == nfirst) - task_step -= step; + task_step = nfirst_task_step; fn (data); if (!priority_queue_empty_p (&task.children_queue, MEMMODEL_RELAXED)) @@ -303,7 +322,7 @@ GOMP_taskloop (void (*fn) (void *), void start += task_step; ((TYPE 
*)arg)[1] = start; if (i == nfirst) - task_step -= step; + task_step = nfirst_task_step; thr->task = parent; task->kind = GOMP_TASK_WAITING; task->fn = fn; --- libgomp/testsuite/libgomp.c-c++-common/taskloop-4.c.jj 2020-01-12 11:54:39.029373941 +0100 +++ libgomp/testsuite/libgomp.c-c++-common/taskloop-4.c 2021-08-20 19:19:27.613993520 +0200 @@ -85,7 +85,8 @@ main () if (test (7, 21, 2, 15, grainsize, &ntasks, &min_iters, &max_iters) != 7 || ntasks != 1 || min_iters != 7 || max_iters != 7) __builtin_abort (); - /* If num_tasks is present, # of task loop iters is min (# of loop iters, num_tasks). */ + /* If num_tasks is present, # of tasks is min (# of loop iters, num_tasks) + and each task has at least one iteration. */ if (test (-51, 2500, 48, 9, num_tasks, &ntasks, &min_iters, &max_iters) != 54 || ntasks != 9) __builtin_abort (); --- libgomp/testsuite/libgomp.c-c++-common/taskloop-5.c.jj 2021-08-20 18:58:21.594313604 +0200 +++ libgomp/testsuite/libgomp.c-c++-common/taskloop-5.c 2021-08-22 14:14:55.859105770 +0200 @@ -0,0 +1,135 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +int u[64], v, w[64]; + +__attribute__((noinline, noclone)) int +test (int a, int b, int c, int d, void (*fn) (int, int, int, int), + int *num_tasks, int *min_iters, int *max_iters, int *sep) +{ + int i, j, t = 0; + __builtin_memset (u, 0, sizeof u); + v = 0; + fn (a, b, c, d); + *min_iters = 0; + *max_iters = 0; + *num_tasks = v; + *sep = v; + if (v) + { + *min_iters = u[0]; + *max_iters = u[0]; + t = u[0]; + for (i = 1; i < v; i++) + { + if (*min_iters > u[i]) + *min_iters = u[i]; + if (*max_iters < u[i]) + *max_iters = u[i]; + t += u[i]; + } + if (*min_iters != *max_iters) + { + for (i = 0; i < v - 1; i++) + { + int min_idx = i; + for (j = i + 1; j < v; j++) + if (w[min_idx] > w[j]) + min_idx = j; + if (min_idx != i) + { + int tem = u[i]; + u[i] = u[min_idx]; + u[min_idx] = tem; + tem = w[i]; + w[i] = w[min_idx]; + w[min_idx] = tem; + } + } + if (u[0] != *max_iters) + __builtin_abort 
(); + for (i = 1; i < v; i++) + if (u[i] != u[i - 1]) + { + if (*sep != v || u[i] != *min_iters) + __builtin_abort (); + *sep = i; + } + } + } + return t; +} + +void +grainsize (int a, int b, int c, int d) +{ + int i, j = 0, k = 0; + #pragma omp taskloop firstprivate (j, k) grainsize(strict:d) + for (i = a; i < b; i += c) + { + if (j == 0) + { + #pragma omp atomic capture + k = v++; + if (k >= 64) + __builtin_abort (); + w[k] = i; + } + u[k] = ++j; + } +} + +void +num_tasks (int a, int b, int c, int d) +{ + int i, j = 0, k = 0; + #pragma omp taskloop firstprivate (j, k) num_tasks(strict:d) + for (i = a; i < b; i += c) + { + if (j == 0) + { + #pragma omp atomic capture + k = v++; + if (k >= 64) + __builtin_abort (); + w[k] = i; + } + u[k] = ++j; + } +} + +int +main () +{ + #pragma omp parallel + #pragma omp single + { + int min_iters, max_iters, ntasks, sep; + /* If grainsize is present and has strict modifier, # of task loop iters is == grainsize, + except that it can be smaller on the last task. */ + if (test (0, 79, 1, 17, grainsize, &ntasks, &min_iters, &max_iters, &sep) != 79 + || ntasks != 5 || min_iters != 11 || max_iters != 17 || sep != 4) + __builtin_abort (); + if (test (-49, 2541, 7, 28, grainsize, &ntasks, &min_iters, &max_iters, &sep) != 370 + || ntasks != 14 || min_iters != 6 || max_iters != 28 || sep != 13) + __builtin_abort (); + if (test (7, 21, 2, 15, grainsize, &ntasks, &min_iters, &max_iters, &sep) != 7 + || ntasks != 1 || min_iters != 7 || max_iters != 7 || sep != 1) + __builtin_abort (); + /* If num_tasks is present, # of tasks is min (# of loop iters, num_tasks) + and each task has at least one iteration. If strict modifier is present, + first set of tasks has ceil (# of loop iters / num_tasks) iterations, + followed by possibly empty set of tasks with floor (# of loop iters / num_tasks) + iterations. 
*/ + if (test (-51, 2500, 48, 9, num_tasks, &ntasks, &min_iters, &max_iters, &sep) != 54 + || ntasks != 9 || min_iters != 6 || max_iters != 6 || sep != 9) + __builtin_abort (); + if (test (0, 57, 1, 9, num_tasks, &ntasks, &min_iters, &max_iters, &sep) != 57 + || ntasks != 9 || min_iters != 6 || max_iters != 7 || sep != 3) + __builtin_abort (); + if (test (0, 25, 2, 17, num_tasks, &ntasks, &min_iters, &max_iters, &sep) != 13 + || ntasks != 13 || min_iters != 1 || max_iters != 1 || sep != 13) + __builtin_abort (); + } + return 0; +} Jakub