On 07/07/2014 02:55 AM, Thomas Schwinge wrote: > On Sun, 6 Jul 2014 16:10:56 -0700, Cesar Philippidis > <cesar_philippi...@mentor.com> wrote: >> This patch is the first step to enabling parallel reductions in openacc. > > Thanks! > >> As mentioned earlier, this patch isn't complete yet. For starters, parts >> of it depends on our internal ptx backend. I've temporarily remapped the >> ptx dependencies to their openmp equivalent, but without a proper >> openacc runtime this infrastructure won't do much. > > For the curious: we're working on preparing our implementation of the > OpenACC Runtime Library for upstream submission; if only the weeks had > more days... > >> Thomas, is this patch OK for gomp-4_0-branch? > > I still :-( haven't managed to allocate the time for a proper review, but > given this doesn't regress any existing test cases, it's fine to commit, > and then we can take it from there. > > A few minor comments: > >> 2014-07-06 Cesar Philippidis <ce...@codesourcery.com> >> Thomas Schwinge <tho...@codesourcery.com> > > By the way, on gomp-4_0-branch, ChangeLog snippets go into the respective > ChangeLog.gomp files. > >> --- a/gcc/c/c-parser.c >> +++ b/gcc/c/c-parser.c >> @@ -11706,7 +11710,8 @@ c_parser_oacc_kernels (location_t loc, c_parser >> *parser, char *p_name) >> */ >> >> #define OACC_LOOP_CLAUSE_MASK >> \ >> - (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_NONE) >> + ( (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_COLLAPSE) \ > > Not yet. ;-) > >> + | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_REDUCTION)) > >> --- a/gcc/fortran/types.def >> +++ b/gcc/fortran/types.def >> @@ -86,6 +86,7 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_UINT, BT_UINT, BT_UINT) >> DEF_FUNCTION_TYPE_1 (BT_FN_PTR_PTR, BT_PTR, BT_PTR) >> DEF_FUNCTION_TYPE_1 (BT_FN_VOID_INT, BT_VOID, BT_INT) >> DEF_FUNCTION_TYPE_1 (BT_FN_BOOL_INT, BT_BOOL, BT_INT) >> +DEF_FUNCTION_TYPE_1 (BT_FN_INT_INT, BT_INT, BT_INT) > > That one's not actually needed, because... > >> --- a/gcc/omp-builtins.def >> +++ b/gcc/omp-builtins.def >> @@ -236,3 +236,6 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_UPDATE, >> "GOMP_target_update", >> BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST) >> DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS, "GOMP_teams", >> BT_FN_VOID_UINT_UINT, ATTR_NOTHROW_LIST) >> + >> +DEF_GOMP_BUILTIN (BUILT_IN_OMP_SET_NUM_THREADS, "omp_set_num_threads", >> + BT_FN_INT_INT, ATTR_CONST_NOTHROW_LEAF_LIST) > > ... it's actually »void omp_set_num_threads (int)«, so BT_FN_VOID_INT. > As this is only temporary code, please add a FIXME comment here. Hmm, > and I wonder, given this is using DEF_*GOMP*_BUILTIN, does this actually > do the right thing if -openmp is not specified?
Thanks for catching those problems! I've committed this updated version of the patch. Cesar
2014-07-08 Cesar Philippidis <ce...@codesourcery.com> Thomas Schwinge <tho...@codesourcery.com> gcc/ * omp-low.c (omp_get_id): New function. (lookup_reduction): New function. (maybe_lookup_reduction): New function. (build_outer_var_ref): Remove openacc assert. (new_omp_context): Preserve ctx->reduction_map. (scan_sharing_clauses): Handle OMP_CLAUSE_REDUCTION. (scan_oacc_offload): Initialize ctx->reduction_map. (lower_reduction_clauses): Handle OpenACC reductions. (omp_gimple_assign_with_ops): New function. (initialize_reduction_data): New function. (finalize_reduction_data): New function. (process_reduction_data): New function. (lower_oacc_offload): Handle reductions. * gcc/omp-builtins.def (BUILT_IN_OMP_SET_NUM_THREADS): New. gcc/c/ * c-parser.c (c_parser_oacc_all_clauses): Handle PRAGMA_OMP_CLAUSE_REDUCTION. (OACC_LOOP_CLAUSE_MASK, OACC_PARALLEL_CLAUSE_MASK): Add PRAGMA_OMP_CLAUSE_REDUCTION. gcc/testsuite/ * gcc/testsuite/c-c++-common/goacc/reduction-1.c: New test. * gcc/testsuite/c-c++-common/goacc/reduction-2.c: New test. * gcc/testsuite/c-c++-common/goacc/reduction-3.c: New test. * gcc/testsuite/c-c++-common/goacc/reduction-4.c: New test. diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 03852b4..6a9271f 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -11332,6 +11332,10 @@ c_parser_oacc_all_clauses (c_parser *parser, omp_clause_mask mask, clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "present_or_create"; break; + case PRAGMA_OMP_CLAUSE_REDUCTION: + clauses = c_parser_omp_clause_reduction (parser, clauses); + c_name = "reduction"; + break; case PRAGMA_OMP_CLAUSE_SELF: clauses = c_parser_oacc_data_clause (parser, c_kind, clauses); c_name = "self"; @@ -11706,7 +11710,7 @@ c_parser_oacc_kernels (location_t loc, c_parser *parser, char *p_name) */ #define OACC_LOOP_CLAUSE_MASK \ - (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_NONE) + (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_REDUCTION) static tree c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name) @@ -11746,6 +11750,7 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name) | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_PRESENT_OR_COPYIN) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_PRESENT_OR_COPYOUT) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_PRESENT_OR_CREATE) \ + | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_REDUCTION) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OMP_CLAUSE_VECTOR_LENGTH) ) static tree diff --git a/gcc/omp-builtins.def b/gcc/omp-builtins.def index 08b825c..698dc79 100644 --- a/gcc/omp-builtins.def +++ b/gcc/omp-builtins.def @@ -236,3 +236,6 @@ DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TARGET_UPDATE, "GOMP_target_update", BT_FN_VOID_INT_PTR_SIZE_PTR_PTR_PTR, ATTR_NOTHROW_LIST) DEF_GOMP_BUILTIN (BUILT_IN_GOMP_TEAMS, "GOMP_teams", BT_FN_VOID_UINT_UINT, ATTR_NOTHROW_LIST) + +DEF_GOMP_BUILTIN (BUILT_IN_OMP_SET_NUM_THREADS, "omp_set_num_threads", + BT_FN_VOID_INT, ATTR_CONST_NOTHROW_LEAF_LIST) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index cd27b76..219d5fe 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -158,6 +158,11 @@ typedef struct omp_context construct. In the case of a parallel, this is in the child function. */ tree block_vars; + /* A map of reduction pointer variables. For accelerators, each + reduction variable is replaced with an array. Each thread, in turn, + is assigned to a slot on that array. */ + splay_tree reduction_map; + /* Label to which GOMP_cancel{,llation_point} and explicit and implicit barriers should jump to during omplower pass. */ tree cancel_label; @@ -221,6 +226,17 @@ static tree scan_omp_1_op (tree *, int *, void *); *handled_ops_p = false; \ break; +/* Helper function to get the reduction array name */ +static const char * +omp_get_id (tree node) +{ + const char *id = IDENTIFIER_POINTER (DECL_NAME (node)); + int len = strlen ("omp$") + strlen (id); + char *temp_name = (char *)alloca (len+1); + snprintf (temp_name, len+1, "gfc$%s", id); + return IDENTIFIER_POINTER(get_identifier (temp_name)); +} + /* Holds a decl for __OPENMP_TARGET__. */ static GTY(()) tree offload_symbol_decl; @@ -873,6 +889,17 @@ lookup_sfield (tree var, omp_context *ctx) } static inline tree +lookup_reduction (const char *id, omp_context *ctx) +{ + gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt)); + + splay_tree_node n; + n = splay_tree_lookup (ctx->reduction_map, + (splay_tree_key) id); + return (tree) n->value; +} + +static inline tree maybe_lookup_field (tree var, omp_context *ctx) { splay_tree_node n; @@ -880,6 +907,17 @@ maybe_lookup_field (tree var, omp_context *ctx) return n ? (tree) n->value : NULL_TREE; } +static inline tree +maybe_lookup_reduction (tree var, omp_context *ctx) +{ + gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt)); + + splay_tree_node n; + n = splay_tree_lookup (ctx->reduction_map, + (splay_tree_key) var); + return n ?(tree) n->value : NULL_TREE; +} + /* Return true if DECL should be copied by pointer. SHARED_CTX is the parallel context if DECL is to be shared. */ @@ -1036,8 +1074,6 @@ build_receiver_ref (tree var, bool by_ref, omp_context *ctx) static tree build_outer_var_ref (tree var, omp_context *ctx) { - gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt)); - tree x; if (is_global_var (maybe_lookup_decl_in_outer_ctx (var, ctx))) @@ -1379,6 +1415,8 @@ new_omp_context (gimple stmt, omp_context *outer_ctx) ctx->cb = outer_ctx->cb; ctx->cb.block = NULL; ctx->depth = outer_ctx->depth + 1; + /* FIXME: handle reductions recursively. */ + ctx->reduction_map = outer_ctx->reduction_map; } else { @@ -1392,6 +1430,7 @@ new_omp_context (gimple stmt, omp_context *outer_ctx) ctx->cb.eh_lp_nr = 0; ctx->cb.transform_call_graph_edges = CB_CGE_MOVE; ctx->depth = 1; + //TODO ctx->reduction_map = TODO; } ctx->cb.decl_map = pointer_map_create (); @@ -1588,7 +1627,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) /* FALLTHRU */ case OMP_CLAUSE_FIRSTPRIVATE: - case OMP_CLAUSE_REDUCTION: if (is_gimple_omp_oacc_specifically (ctx->stmt)) { sorry ("clause not supported yet"); @@ -1596,6 +1634,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) } case OMP_CLAUSE_LINEAR: gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt)); + case OMP_CLAUSE_REDUCTION: decl = OMP_CLAUSE_DECL (c); do_private: if (is_variable_sized (decl)) @@ -1621,6 +1660,28 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) install_var_field (decl, by_ref, 3, ctx); } install_var_local (decl, ctx); + //TODO + if (is_gimple_omp_oacc_specifically (ctx->stmt)) + { + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION) + { + /* Create a decl for the reduction array. */ + tree var = OMP_CLAUSE_DECL (c); + tree ptype = build_pointer_type (TREE_TYPE (var)); + tree array = create_tmp_var (ptype, omp_get_id (var)); + omp_context *c = (ctx->field_map ? ctx : ctx->outer); + install_var_field (array, true, 3, c); + install_var_local (array, c); + + /* Insert it into the current context. */ + splay_tree_insert (ctx->reduction_map, + (splay_tree_key) omp_get_id(var), + (splay_tree_value) array); + splay_tree_insert (ctx->reduction_map, + (splay_tree_key) array, + (splay_tree_value) array); + } + } break; case OMP_CLAUSE__LOOPTEMP_: @@ -1658,10 +1719,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) case OMP_CLAUSE_NUM_WORKERS: case OMP_CLAUSE_VECTOR_LENGTH: if (ctx->outer) - { - gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt)); scan_omp_op (&OMP_CLAUSE_OPERAND (c, 0), ctx->outer); - } break; case OMP_CLAUSE_TO: @@ -1750,7 +1808,16 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) && TREE_CODE (TREE_TYPE (decl)) == ARRAY_TYPE) install_var_field (decl, true, 7, ctx); else - install_var_field (decl, true, 3, ctx); + { + if (!is_gimple_omp_oacc_specifically (ctx->stmt)) + install_var_field (decl, true, 3, ctx); + else + { + /* decl goes heres. */ + omp_context *c = (ctx->field_map ? ctx : ctx->outer); + install_var_field (decl, true, 3, c); + } + } if (is_gimple_omp_offloaded (ctx->stmt)) install_var_local (decl, ctx); } @@ -1844,7 +1911,6 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) /* FALLTHRU */ case OMP_CLAUSE_FIRSTPRIVATE: - case OMP_CLAUSE_REDUCTION: if (is_gimple_omp_oacc_specifically (ctx->stmt)) { sorry ("clause not supported yet"); @@ -1852,6 +1918,7 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) } case OMP_CLAUSE_LINEAR: gcc_assert (!is_gimple_omp_oacc_specifically (ctx->stmt)); + case OMP_CLAUSE_REDUCTION: case OMP_CLAUSE_PRIVATE: decl = OMP_CLAUSE_DECL (c); if (is_variable_sized (decl)) @@ -2161,6 +2228,7 @@ scan_oacc_offload (gimple stmt, omp_context *outer_ctx) DECL_NAMELESS (name) = 1; TYPE_NAME (ctx->record_type) = name; create_omp_child_function (ctx, false); + ctx->reduction_map = splay_tree_new (splay_tree_compare_pointers, 0, 0); gimple_omp_set_child_fn (stmt, ctx->cb.dst_fn); @@ -4211,6 +4279,8 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) if (count == 1) { + if (!is_gimple_omp_oacc_specifically (ctx->stmt)) + { tree addr = build_fold_addr_expr_loc (clause_loc, ref); addr = save_expr (addr); @@ -4219,6 +4289,117 @@ lower_reduction_clauses (tree clauses, gimple_seq *stmt_seqp, omp_context *ctx) x = build2 (OMP_ATOMIC, void_type_node, addr, x); gimplify_and_add (x, stmt_seqp); return; + } + else + { + /* The atomic add at the end of the sum creates unnecessary + write contention on accelerators. To work around that, + create an array or vector_length and assign an element to + each thread. Later, in lower_omp_for (for openacc), the + values of array will be combined. */ + + tree t = NULL_TREE, array, nthreads; + + /* First ensure that the current tid is less than vector_length. */ + tree exit_label = create_artificial_label (UNKNOWN_LOCATION); + tree reduction_label = create_artificial_label (UNKNOWN_LOCATION); + + /* Get the current thread id. */ + tree call = builtin_decl_explicit (BUILT_IN_OMP_GET_THREAD_NUM); + gimple stmt = gimple_build_call (call, 1, integer_zero_node); + tree fntype = gimple_call_fntype (stmt); + tree tid = create_tmp_var (TREE_TYPE (fntype), NULL); + gimple_call_set_lhs (stmt, tid); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Find the total number of threads. A reduction clause + only appears inside a loop construction or a combined + parallel and loop construct. */ + tree c; + + if (gimple_code (ctx->stmt) == GIMPLE_OMP_FOR) + c = gimple_oacc_parallel_clauses (ctx->outer->stmt); + else + c = gimple_oacc_parallel_clauses (ctx->stmt); + + t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH); + + if (t) + { + t = fold_convert_loc (OMP_CLAUSE_LOCATION (t), + integer_type_node, + OMP_CLAUSE_VECTOR_LENGTH_EXPR (t)); + } + + if (!t) + t = integer_one_node; + + /* Extract the number of threads. */ + nthreads = create_tmp_var (sizetype, NULL); + gimplify_assign (nthreads, fold_build1 (NOP_EXPR, sizetype, t), + stmt_seqp); + stmt = gimple_build_assign_with_ops (MINUS_EXPR, nthreads, nthreads, + fold_build1 (NOP_EXPR, sizetype, + integer_one_node)); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* If tid >= nthreads, goto exit_label. */ + t = create_tmp_var (sizetype, NULL); + gimplify_assign (t, fold_build1 (NOP_EXPR, sizetype, tid), + stmt_seqp); + stmt = gimple_build_cond (GT_EXPR, t, nthreads, exit_label, + reduction_label); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Place the reduction_label here. */ + + gimple_seq_add_stmt (stmt_seqp, + gimple_build_label (reduction_label)); + + /* Now insert the partial reductions into the array. */ + + /* Create an array for the reduction variable and install it + in the parent scope. */ + tree ptype = build_pointer_type (TREE_TYPE (var)); + + t = lookup_reduction (omp_get_id (var), ctx); + t = build_receiver_ref (t, false, ctx->outer); + + array = create_tmp_var (ptype, NULL); + gimplify_assign (array, t, stmt_seqp); + + tree ptr = create_tmp_var (TREE_TYPE (array), NULL); + + /* Find the reduction array. */ + + /* testing a unary conversion. */ + tree offset = create_tmp_var (sizetype, NULL); + gimplify_assign (offset, TYPE_SIZE_UNIT (TREE_TYPE (var)), + stmt_seqp); + t = create_tmp_var (sizetype, NULL); + gimplify_assign (t, unshare_expr (fold_build1 (NOP_EXPR, sizetype, + tid)), + stmt_seqp); + stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, t); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Offset expression. Does the POINTER_PLUS_EXPR take care + of adding sizeof(var) to the array? */ + ptr = create_tmp_var (ptype, NULL); + stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, + unshare_expr(ptr), + array, offset); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Move the local sum to gfc$sum[i]. */ + x = unshare_expr (build_simple_mem_ref (ptr)); + stmt = gimplify_assign (x, new_var, stmt_seqp); + + /* Place exit label here. */ + gimple_seq_add_stmt (stmt_seqp, gimple_build_label (exit_label)); + + return; + } } if (OMP_CLAUSE_REDUCTION_PLACEHOLDER (c)) @@ -9138,6 +9319,410 @@ make_pass_expand_omp (gcc::context *ctxt) return new pass_expand_omp (ctxt); } +/* Helper function to preform, potentially COMPLEX_TYPE, operation and + convert it to gimple. */ +static void +omp_gimple_assign_with_ops (tree_code op, tree dest, tree src, gimple_seq *seq) +{ + gimple stmt; + + if (TREE_CODE (TREE_TYPE (dest)) != COMPLEX_TYPE) + { + stmt = gimple_build_assign_with_ops (op, dest, dest, src); + gimple_seq_add_stmt (seq, stmt); + return; + } + + tree t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree rdest = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest); + gimplify_assign (t, rdest, seq); + rdest = t; + + t = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree idest = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (dest)), dest); + gimplify_assign (t, idest, seq); + idest = t; + + t = create_tmp_var (TREE_TYPE (TREE_TYPE (src)), NULL); + tree rsrc = fold_build1 (REALPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src); + gimplify_assign (t, rsrc, seq); + rsrc = t; + + t = create_tmp_var (TREE_TYPE (TREE_TYPE (src)), NULL); + tree isrc = fold_build1 (IMAGPART_EXPR, TREE_TYPE (TREE_TYPE (src)), src); + gimplify_assign (t, isrc, seq); + isrc = t; + + tree r = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree i = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree result; + + gcc_assert (op == PLUS_EXPR || op == MULT_EXPR); + + if (op == PLUS_EXPR) + { + stmt = gimple_build_assign_with_ops (op, r, rdest, rsrc); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (op, i, idest, isrc); + gimple_seq_add_stmt (seq, stmt); + } + else if (op == MULT_EXPR) + { + /* Let x = a + ib = dest, y = c + id = src. + x * y = (ac - bd) + i(ad + bc) */ + tree ac = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree bd = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree ad = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + tree bc = create_tmp_var (TREE_TYPE (TREE_TYPE (dest)), NULL); + + stmt = gimple_build_assign_with_ops (MULT_EXPR, ac, rdest, rsrc); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (MULT_EXPR, bd, idest, isrc); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (MINUS_EXPR, r, ac, bd); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (MULT_EXPR, ad, rdest, isrc); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (MULT_EXPR, bd, idest, rsrc); + gimple_seq_add_stmt (seq, stmt); + + stmt = gimple_build_assign_with_ops (PLUS_EXPR, i, ad, bc); + gimple_seq_add_stmt (seq, stmt); + } + + result = build2 (COMPLEX_EXPR, TREE_TYPE (dest), r, i); + gimplify_assign (dest, result, seq); +} + +/* Helper function to initialize local data for the reduction arrays. + The reduction arrays need to be placed inside the calling function + for accelerators, or else the host won't be able to preform the final + reduction. FIXME: This function assumes that there are + vector_length threads in total. */ + +static void +initialize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp, + omp_context *ctx) +{ + gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt)); + + tree c, t, oc; + gimple stmt; + omp_context *octx; + tree (*gimple_omp_clauses) (const_gimple); + void (*gimple_omp_set_clauses) (gimple, tree); + + /* Find the innermost PARALLEL openmp context. FIXME: OpenACC kernels + may require extra care unless they are converted to openmp for loops. */ + + if (gimple_code (ctx->stmt) == GIMPLE_OACC_PARALLEL) + octx = ctx; + else + octx = ctx->outer; + + gimple_omp_clauses = gimple_oacc_parallel_clauses; + gimple_omp_set_clauses = gimple_oacc_parallel_set_clauses; + + /* Extract the clauses. */ + oc = gimple_omp_clauses (octx->stmt); + + /* Find the last outer clause. */ + for (; oc && OMP_CLAUSE_CHAIN (oc); oc = OMP_CLAUSE_CHAIN (oc)) + ; + + /* Allocate arrays for each reduction variable. */ + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + { + if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION) + continue; + + tree var = OMP_CLAUSE_DECL (c); + tree array = lookup_reduction (omp_get_id (var), ctx); + tree size, call; + + /* Calculate size of the reduction array. */ + t = create_tmp_var (TREE_TYPE (nthreads), NULL); + stmt = gimple_build_assign_with_ops (MULT_EXPR, t, nthreads, + fold_convert (TREE_TYPE (nthreads), + TYPE_SIZE_UNIT (TREE_TYPE (var)))); + gimple_seq_add_stmt (stmt_seqp, stmt); + + size = create_tmp_var (sizetype, NULL); + gimplify_assign (size, fold_build1 (NOP_EXPR, sizetype, t), stmt_seqp); + + /* Now allocate memory for it. FIXME: Allocating memory for the + reduction array may be unnecessary once the final reduction is able + to be preformed on the accelerator. Instead of allocating memory on + the host side, it could just be allocated on the accelerator. */ + call = unshare_expr (builtin_decl_explicit (BUILT_IN_ALLOCA)); + stmt = gimple_build_call (call, 1, size); + gimple_call_set_lhs (stmt, array); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Map this array into the accelerator. */ + + /* Add the reduction array to the list of clauses. */ + /* FIXME: Currently, these variables must be placed in the outer + most clause so that copy-out works. */ + tree x = array; + t = build_omp_clause (gimple_location (ctx->stmt), OMP_CLAUSE_MAP); + OMP_CLAUSE_MAP_KIND (t) = OMP_CLAUSE_MAP_FORCE_FROM; + OMP_CLAUSE_DECL (t) = x; + OMP_CLAUSE_CHAIN (t) = NULL; + if (oc) + OMP_CLAUSE_CHAIN (oc) = t; + else + gimple_omp_set_clauses (octx->stmt, t); + OMP_CLAUSE_SIZE (t) = size; + oc = t; + } +} + +/* Helper function to finalize local data for the reduction arrays. The + reduction array needs to be reduced to the original reduction variable. + FIXME: This function assumes that there are vector_length threads in + total. Also, it assumes that there are at least vector_length iterations + in the for loop. */ + +static void +finalize_reduction_data (tree clauses, tree nthreads, gimple_seq *stmt_seqp, + omp_context *ctx) +{ + gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt)); + + tree c, var, array, loop_header, loop_body, loop_exit; + gimple stmt; + + /* Create for loop. + + let var = the original reduction variable + let array = reduction variable array + + var = array[0] + for (i = 1; i < nthreads; i++) + var op= array[i] + */ + + loop_header = create_artificial_label (UNKNOWN_LOCATION); + loop_body = create_artificial_label (UNKNOWN_LOCATION); + loop_exit = create_artificial_label (UNKNOWN_LOCATION); + + /* Initialize the reduction variables to be value of the first array + element. */ + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + { + if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION) + continue; + + tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c); + + /* reduction(-:var) sums up the partial results, so it acts + identically to reduction(+:var). */ + if (reduction_code == MINUS_EXPR) + reduction_code = PLUS_EXPR; + + /* Set up reduction variable, var. Becuase it's not gimple register, + it needs to be treated as a reference. */ + var = OMP_CLAUSE_DECL (c); + + tree ptr = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx); + + /* Extract array[ix] into mem. */ + tree mem = create_tmp_var (TREE_TYPE (var), NULL); + gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp); + + /* Find the original reduction variable. */ + tree new_var = lookup_decl (var, ctx); + tree x = build_outer_var_ref (var, ctx); + if (is_reference (var)) + new_var = build_simple_mem_ref (new_var); + + x = lang_hooks.decls.omp_clause_assign_op (c, var, mem); + gimplify_and_add (unshare_expr(x), stmt_seqp); + } + + /* Create an index variable and set it to one. */ + tree ix = create_tmp_var (sizetype, NULL); + gimplify_assign (ix, fold_build1 (NOP_EXPR, sizetype, integer_one_node), + stmt_seqp); + + /* Insert the loop header label here. */ + gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_header)); + + /* Loop if ix >= nthreads. */ + tree x = create_tmp_var (sizetype, NULL); + gimplify_assign (x, fold_build1 (NOP_EXPR, sizetype, nthreads), stmt_seqp); + stmt = gimple_build_cond (GE_EXPR, ix, x, loop_exit, loop_body); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Insert the loop body label here. */ + gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_body)); + + /* Collapse each reduction array, one element at a time. */ + for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + { + if (OMP_CLAUSE_CODE (c) != OMP_CLAUSE_REDUCTION) + continue; + + tree_code reduction_code = OMP_CLAUSE_REDUCTION_CODE (c); + + /* reduction(-:var) sums up the partial results, so it acts + identically to reduction(+:var). */ + if (reduction_code == MINUS_EXPR) + reduction_code = PLUS_EXPR; + + /* Set up reduction variable var. */ + var = OMP_CLAUSE_DECL (c); + + array = lookup_reduction (omp_get_id (OMP_CLAUSE_DECL (c)), ctx); + + /* Calculate the array offset. */ + tree offset = create_tmp_var (sizetype, NULL); + gimplify_assign (offset, TYPE_SIZE_UNIT (TREE_TYPE (var)), stmt_seqp); + stmt = gimple_build_assign_with_ops (MULT_EXPR, offset, offset, ix); + gimple_seq_add_stmt (stmt_seqp, stmt); + + tree ptr = create_tmp_var (TREE_TYPE (array), NULL); + stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, ptr, array, + offset); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Extract array[ix] into mem. */ + tree mem = create_tmp_var (TREE_TYPE (var), NULL); + gimplify_assign (mem, build_simple_mem_ref (ptr), stmt_seqp); + + /* Find the original reduction variable. */ + tree new_var = lookup_decl (var, ctx); + tree x = build_outer_var_ref (var, ctx); + if (is_reference (var)) + new_var = build_simple_mem_ref (new_var); + + tree t = create_tmp_var (TREE_TYPE (var), NULL); + + x = lang_hooks.decls.omp_clause_assign_op (c, t, var); + gimplify_and_add (unshare_expr(x), stmt_seqp); + + /* var = var op mem */ + switch (OMP_CLAUSE_REDUCTION_CODE (c)) + { + case TRUTH_ANDIF_EXPR: + case TRUTH_ORIF_EXPR: + t = fold_build2 (OMP_CLAUSE_REDUCTION_CODE (c), integer_type_node, + t, mem); + gimplify_and_add (t, stmt_seqp); + break; + default: + /* The lhs isn't a gimple_reg when var is COMPLEX_TYPE. */ + omp_gimple_assign_with_ops (OMP_CLAUSE_REDUCTION_CODE (c), + t, mem, stmt_seqp); + } + + t = fold_build1 (NOP_EXPR, TREE_TYPE (var), t); + x = lang_hooks.decls.omp_clause_assign_op (c, var, t); + gimplify_and_add (unshare_expr(x), stmt_seqp); + } + + /* Increment the induction variable. */ + tree one = fold_build1 (NOP_EXPR, sizetype, integer_one_node); + stmt = gimple_build_assign_with_ops (PLUS_EXPR, ix, ix, one); + gimple_seq_add_stmt (stmt_seqp, stmt); + + /* Go back to the top of the loop. */ + gimple_seq_add_stmt (stmt_seqp, gimple_build_goto (loop_header)); + + /* Place the loop exit label here. */ + gimple_seq_add_stmt (stmt_seqp, gimple_build_label (loop_exit)); +} + +/* Scan through all of the gimple stmts searching for an OMP_FOR_EXPR, and + scan that for reductions. */ + +static void +process_reduction_data (gimple_seq *body, gimple_seq *in_stmt_seqp, + gimple_seq *out_stmt_seqp, omp_context *ctx) +{ + gcc_assert (is_gimple_omp_oacc_specifically (ctx->stmt)); + + gimple_stmt_iterator gsi; + + for (gsi = gsi_start (*body); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + tree call; + + switch (gimple_code (stmt)) + { + case GIMPLE_OMP_FOR: + tree clauses, nthreads, t; + + clauses = gimple_omp_for_clauses (stmt); + ctx = maybe_lookup_ctx (stmt); + t = NULL_TREE; + + /* The reduction clause may be nested inside a loop directive. + Scan for the innermost vector_length clause. */ + for (omp_context *oc = ctx; oc; oc = oc->outer) + { + tree c; + + switch (gimple_code (oc->stmt)) + { + case GIMPLE_OACC_PARALLEL: + c = gimple_oacc_parallel_clauses (oc->stmt); + break; + case GIMPLE_OMP_FOR: + c = gimple_omp_for_clauses (oc->stmt); + break; + default: + c = NULL_TREE; + break; + } + + if (c && gimple_code (oc->stmt) == GIMPLE_OACC_PARALLEL) + { + t = find_omp_clause (c, OMP_CLAUSE_VECTOR_LENGTH); + if (t) + t = fold_convert_loc (OMP_CLAUSE_LOCATION (t), + integer_type_node, + OMP_CLAUSE_VECTOR_LENGTH_EXPR (t)); + break; + } + } + + if (!t) + t = integer_one_node; + + /* Extract the number of threads. */ + nthreads = create_tmp_var (TREE_TYPE (t), NULL); + gimplify_assign (nthreads, t, in_stmt_seqp); + + /* Ensure nthreads >= 1. */ + stmt = gimple_build_assign_with_ops (MAX_EXPR, nthreads, nthreads, + fold_convert(TREE_TYPE (nthreads), + integer_one_node)); + gimple_seq_add_stmt (in_stmt_seqp, stmt); + + /* Set the number of threads. */ + /* FIXME: This needs to handle accelerators */ + call = builtin_decl_explicit (BUILT_IN_OMP_SET_NUM_THREADS); + stmt = gimple_build_call (call, 1, nthreads); + gimple_seq_add_stmt (in_stmt_seqp, stmt); + + initialize_reduction_data (clauses, nthreads, in_stmt_seqp, ctx); + finalize_reduction_data (clauses, nthreads, out_stmt_seqp, ctx); + break; + default: + // Scan for other directives which support reduction here. + break; + } + } +} + /* Routines to lower OpenMP directives into OMP-GIMPLE. */ /* Lower the OpenACC offload directive in the current statement @@ -9150,7 +9735,7 @@ lower_oacc_offload (gimple_stmt_iterator *gsi_p, omp_context *ctx) tree child_fn, t, c; gimple stmt = gsi_stmt (*gsi_p); gimple par_bind, bind; - gimple_seq par_body, olist, ilist, new_body; + gimple_seq par_body, olist, ilist, orlist, irlist, new_body; location_t loc = gimple_location (stmt); unsigned int map_cnt = 0; tree (*gimple_omp_clauses) (const_gimple); @@ -9176,6 +9761,10 @@ lower_oacc_offload (gimple_stmt_iterator *gsi_p, omp_context *ctx) push_gimplify_context (); + irlist = NULL; + orlist = NULL; + process_reduction_data (&par_body, &irlist, &orlist, ctx); + for (c = clauses; c ; c = OMP_CLAUSE_CHAIN (c)) switch (OMP_CLAUSE_CODE (c)) { @@ -9330,7 +9919,8 @@ lower_oacc_offload (gimple_stmt_iterator *gsi_p, omp_context *ctx) avar = build_fold_addr_expr (avar); gimplify_assign (x, avar, &ilist); } - else if (is_gimple_reg (var)) + else if (is_gimple_reg (var) + && !maybe_lookup_reduction (var, ctx)) { tree avar = create_tmp_var (TREE_TYPE (var), NULL); mark_addressable (avar); @@ -9355,7 +9945,8 @@ lower_oacc_offload (gimple_stmt_iterator *gsi_p, omp_context *ctx) } else { - var = build_fold_addr_expr (var); + if (!maybe_lookup_reduction (var, ctx)) + var = build_fold_addr_expr (var); gimplify_assign (x, var, &ilist); } } @@ -9439,9 +10030,11 @@ lower_oacc_offload (gimple_stmt_iterator *gsi_p, omp_context *ctx) bind = gimple_build_bind (NULL, NULL, gimple_bind_block (par_bind)); gsi_replace (gsi_p, bind, true); + gimple_bind_add_seq (bind, irlist); gimple_bind_add_seq (bind, ilist); gimple_bind_add_stmt (bind, stmt); gimple_bind_add_seq (bind, olist); + gimple_bind_add_seq (bind, orlist); pop_gimplify_context (NULL); } diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-1.c b/gcc/testsuite/c-c++-common/goacc/reduction-1.c new file mode 100644 index 0000000..cff7d2d --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-1.c @@ -0,0 +1,80 @@ +/* Integer reductions. */ + +#define vl 32 + +int +main(void) +{ + const int n = 1000; + int i; + int result, array[n]; + int lresult; + + /* '+' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) + result += array[i]; +#pragma acc end parallel + + /* '*' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (*:result) + for (i = 0; i < n; i++) + result *= array[i]; +#pragma acc end parallel + +// result = 0; +// vresult = 0; +// +// /* 'max' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result > array[i] ? result : array[i]; +// #pragma acc end parallel +// +// /* 'min' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result < array[i] ? result : array[i]; +// #pragma acc end parallel + + /* '&' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (&:result) + for (i = 0; i < n; i++) + result &= array[i]; +#pragma acc end parallel + + /* '|' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (|:result) + for (i = 0; i < n; i++) + result |= array[i]; +#pragma acc end parallel + + /* '^' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (^:result) + for (i = 0; i < n; i++) + result ^= array[i]; +#pragma acc end parallel + + /* '&&' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (&&:lresult) + for (i = 0; i < n; i++) + lresult = lresult && (result > array[i]); +#pragma acc end parallel + + /* '||' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (||:lresult) + for (i = 0; i < n; i++) + lresult = lresult || (result > array[i]); +#pragma acc end parallel + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-2.c b/gcc/testsuite/c-c++-common/goacc/reduction-2.c new file mode 100644 index 0000000..9686b37 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-2.c @@ -0,0 +1,56 @@ +/* float reductions. */ + +#define vl 32 + +int +main(void) +{ + const int n = 1000; + int i; + float result, array[n]; + int lresult; + + /* '+' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) + result += array[i]; +#pragma acc end parallel + + /* '*' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (*:result) + for (i = 0; i < n; i++) + result *= array[i]; +#pragma acc end parallel + +// /* 'max' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result > array[i] ? result : array[i]; +// #pragma acc end parallel +// +// /* 'min' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result < array[i] ? result : array[i]; +// #pragma acc end parallel + + /* '&&' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (&&:lresult) + for (i = 0; i < n; i++) + lresult = lresult && (result > array[i]); +#pragma acc end parallel + + /* '||' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (||:lresult) + for (i = 0; i < n; i++) + lresult = lresult || (result > array[i]); +#pragma acc end parallel + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-3.c b/gcc/testsuite/c-c++-common/goacc/reduction-3.c new file mode 100644 index 0000000..c618c4e --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-3.c @@ -0,0 +1,56 @@ +/* double reductions. */ + +#define vl 32 + +int +main(void) +{ + const int n = 1000; + int i; + double result, array[n]; + int lresult; + + /* '+' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) + result += array[i]; +#pragma acc end parallel + + /* '*' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (*:result) + for (i = 0; i < n; i++) + result *= array[i]; +#pragma acc end parallel + +// /* 'max' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result > array[i] ? result : array[i]; +// #pragma acc end parallel +// +// /* 'min' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result < array[i] ? result : array[i]; +// #pragma acc end parallel + + /* '&&' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (&&:lresult) + for (i = 0; i < n; i++) + lresult = lresult && (result > array[i]); +#pragma acc end parallel + + /* '||' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (||:lresult) + for (i = 0; i < n; i++) + lresult = lresult || (result > array[i]); +#pragma acc end parallel + + return 0; +} diff --git a/gcc/testsuite/c-c++-common/goacc/reduction-4.c b/gcc/testsuite/c-c++-common/goacc/reduction-4.c new file mode 100644 index 0000000..1e032a1 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/reduction-4.c @@ -0,0 +1,58 @@ +/* complex reductions. */ + +#define vl 32 + +int +main(void) +{ + const int n = 1000; + int i; + __complex__ double result, array[n]; + int lresult; + + /* '+' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (+:result) + for (i = 0; i < n; i++) + result += array[i]; +#pragma acc end parallel + + /* Needs support for complex multiplication. */ + +// /* '*' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (*:result) +// for (i = 0; i < n; i++) +// result *= array[i]; +// #pragma acc end parallel +// +// /* 'max' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result > array[i] ? result : array[i]; +// #pragma acc end parallel +// +// /* 'min' reductions. */ +// #pragma acc parallel vector_length (vl) +// #pragma acc loop reduction (+:result) +// for (i = 0; i < n; i++) +// result = result < array[i] ? result : array[i]; +// #pragma acc end parallel + + /* '&&' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (&&:lresult) + for (i = 0; i < n; i++) + lresult = lresult && (__real__(result) > __real__(array[i])); +#pragma acc end parallel + + /* '||' reductions. */ +#pragma acc parallel vector_length (vl) +#pragma acc loop reduction (||:lresult) + for (i = 0; i < n; i++) + lresult = lresult || (__real__(result) > __real__(array[i])); +#pragma acc end parallel + + return 0; +}