On Mon, Dec 07, 2015 at 12:22:43PM +0100, Martin Jambor wrote: > it creates a copy of the entire target body and expands it slightly > differently for concurrent execution on a GPU. Note that both teams > and distribute constructs are mandatory. Moreover, currently the > distribute has to be in a combined statement with the inner for > construct. And there are quite a few other restrictions which I hope
The standard calls those composite constructs, and I bet for gridification you want that restriction always, without composite distribute parallel for there are two different unrelated loops. > * builtin-types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New. > (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed. > (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New. > * fortran/types.def (BT_FN_VOID_UINT_PTR_INT_PTR): New. > (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT): Removed. > (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR): New. Fortran has its own ChangeLog file. > @@ -556,9 +558,9 @@ DEF_FUNCTION_TYPE_9 > (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT, > BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG, > BT_BOOL, BT_UINT, BT_PTR, BT_INT) > > -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT, > - BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR, > - BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT) > +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR, > + BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR, > + BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR) There shouldn't be an empty line in between this DEF_FUNCTION_TYPE_9 and the previous one. > @@ -221,9 +223,9 @@ DEF_FUNCTION_TYPE_9 > (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_BOOL_UINT_PTR_INT, > BT_PTR_FN_VOID_PTR_PTR, BT_LONG, BT_LONG, > BT_BOOL, BT_UINT, BT_PTR, BT_INT) > > -DEF_FUNCTION_TYPE_10 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_INT_INT, > +DEF_FUNCTION_TYPE_9 (BT_FN_VOID_INT_OMPFN_SIZE_PTR_PTR_PTR_UINT_PTR_PTR, > BT_VOID, BT_INT, BT_PTR_FN_VOID_PTR, BT_SIZE, BT_PTR, > - BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_INT, BT_INT) > + BT_PTR, BT_PTR, BT_UINT, BT_PTR, BT_PTR) > > DEF_FUNCTION_TYPE_11 > (BT_FN_VOID_OMPFN_PTR_OMPCPYFN_LONG_LONG_UINT_LONG_INT_LONG_LONG_LONG, > BT_VOID, BT_PTR_FN_VOID_PTR, BT_PTR, Ditto. > --- a/gcc/gimple.def > +++ b/gcc/gimple.def > @@ -369,13 +369,17 @@ DEFGSCODE(GIMPLE_OMP_TARGET, "gimple_omp_target", > GSS_OMP_PARALLEL_LAYOUT) > /* GIMPLE_OMP_TEAMS <BODY, CLAUSES> represents #pragma omp teams > BODY is the sequence of statements inside the single section. > CLAUSES is an OMP_CLAUSE chain holding the associated clauses. */ > -DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_SINGLE_LAYOUT) > +DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_TEAMS_LAYOUT) Why? > +/* GIMPLE_OMP_GPUKERNEL <BODY> represents a parallel loop lowered for > execution > + on a GPU. It is an artificial statement created by omp lowering. */ > +DEFGSCODE(GIMPLE_OMP_GPUKERNEL, "gimple_omp_gpukernel", GSS_OMP) Why do you call it GPUKERNEL or KERNEL_BODY when you really mean gridified body and gridified loop? I mean, what is GPU specific about it? PTX is unlikely going to use that. And kernel is a wide term. > @@ -622,8 +623,14 @@ struct GTY((tag("GSS_OMP_FOR"))) > /* [ WORD 11 ] > Pre-body evaluated before the loop body begins. */ > gimple_seq pre_body; > + > + /* [ WORD 12 ] > + If set, this statement is part of a gridified kernel, its clauses need > to > + be scanned and lowered but the statement should be discarded after > + lowering. */ > + bool kernel_phony; Ugh no, flags should go into GF_OMP_*. > @@ -643,6 +660,12 @@ struct GTY((tag("GSS_OMP_PARALLEL_LAYOUT"))) > /* [ WORD 10 ] > Shared data argument. */ > tree data_arg; > + > + /* [ WORD 11 ] */ > + /* If set, this statement is part of a gridified kernel, its clauses need > to > + be scanned and lowered but the statement should be discarded after > + lowering. */ > + bool kernel_phony; > }; Likewise. As for omp-low.c changes, the file is already large enough that it would be nice if it is easy to find out what routines are for gridification purposes only, use some special prefix (grid_*, ompgrid_*, ...) for all such functions? > @@ -1761,6 +1786,8 @@ fixup_child_record_type (omp_context *ctx) > { > tree f, type = ctx->record_type; > > + if (!ctx->receiver_decl) > + return; So when is receiver_decl NULL? > @@ -2113,6 +2140,14 @@ scan_sharing_clauses (tree clauses, omp_context *ctx) > } > break; > > + case OMP_CLAUSE__GRIDDIM_: > + if (ctx->outer) > + { > + scan_omp_op (&OMP_CLAUSE_GRIDDIM_SIZE (c), ctx->outer); > + scan_omp_op (&OMP_CLAUSE_GRIDDIM_GROUP (c), ctx->outer); These should be OMP_CLAUSE__GRIDDIM__{SIZE,GROUP}. See OMP_CLAUSE__SIMDUID__DECL for another similar macro. > @@ -6252,6 +6302,37 @@ gimple_build_cond_empty (tree cond) > return gimple_build_cond (pred_code, lhs, rhs, NULL_TREE, NULL_TREE); > } > > +/* Return true if a parallel REGION is within a declare target function or > + within a target region and is not a part of a gridified kernel. */ > + > +static bool > +region_needs_kernel_p (struct omp_region *region) > +{ > + bool indirect = false; > + for (region = region->outer; region; region = region->outer) > + { > + if (region->type == GIMPLE_OMP_PARALLEL) > + indirect = true; > + else if (region->type == GIMPLE_OMP_TARGET) > + { > + gomp_target *tgt_stmt; > + tgt_stmt = as_a <gomp_target *> (last_stmt (region->entry)); gomp_target *tgt_stmt = as_a <gomp_target *> (last_stmt (region->entry)); ? > +static GTY(()) tree kernel_dim_array_type; > +static GTY(()) tree kernel_lattrs_dimnum_decl; > +static GTY(()) tree kernel_lattrs_grid_decl; > +static GTY(()) tree kernel_lattrs_group_decl; > +static GTY(()) tree kernel_launch_attributes_type; Turn this at least into either a struct or array of trees, so that it is not 5 separate GC roots? > + tree dim_arr_index_type; > + dim_arr_index_type = build_index_type (build_int_cst (integer_type_node, > 2)); See above for formatting; even if you don't have the declaration one line above it, putting = in 5th column of next line will be often beneficial for the formatting: > + kernel_dim_array_type = build_array_type (uint32_type_node, > + dim_arr_index_type); > + > + kernel_launch_attributes_type = make_node (RECORD_TYPE); > + kernel_lattrs_dimnum_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL, > + get_identifier ("ndim"), > + uint32_type_node); > + DECL_CHAIN (kernel_lattrs_dimnum_decl) = NULL_TREE; > + > + kernel_lattrs_grid_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL, > + get_identifier ("grid_size"), > + kernel_dim_array_type); > + DECL_CHAIN (kernel_lattrs_grid_decl) = kernel_lattrs_dimnum_decl; > + kernel_lattrs_group_decl = build_decl (BUILTINS_LOCATION, FIELD_DECL, > + get_identifier ("group_size"), > + kernel_dim_array_type); > + DECL_CHAIN (kernel_lattrs_group_decl) = kernel_lattrs_grid_decl; > + finish_builtin_struct (kernel_launch_attributes_type, > + "__gomp_kernel_launch_attributes", > + kernel_lattrs_group_decl, NULL_TREE); > +static tree > +get_target_arguments (gimple_stmt_iterator *gsi, gomp_target *tgt_stmt) > +{ > + auto_vec <tree, 4> args; > + tree clauses = gimple_omp_target_clauses (tgt_stmt); > + tree t, c = find_omp_clause (clauses, OMP_CLAUSE_NUM_TEAMS); > + if (c) > + t = OMP_CLAUSE_NUM_TEAMS_EXPR (c); > + else > + t = integer_minus_one_node; > + t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL, > + GOMP_TARGET_ARG_NUM_TEAMS, t); > + args.quick_push (t); This is what I've talked about in review of another patch. num_teams is int, for 32-bit targets trying to encode it into 16 bits is not going to work. > + > + c = find_omp_clause (clauses, OMP_CLAUSE_THREAD_LIMIT); > + if (c) > + t = OMP_CLAUSE_THREAD_LIMIT_EXPR (c); > + else > + t = integer_minus_one_node; > + t = get_target_argument_value (gsi, GOMP_TARGET_ARG_DEVICE_ALL, > + GOMP_TARGET_ARG_THREAD_LIMIT, t); Ditto. > @@ -14872,6 +15392,14 @@ lower_omp_taskreg (gimple_stmt_iterator *gsi_p, > omp_context *ctx) > par_olist = NULL; > par_ilist = NULL; > par_rlist = NULL; > + bool phony_construct = is_a <gomp_parallel *> (stmt) > + && gimple_omp_parallel_kernel_phony (as_a <gomp_parallel *> (stmt)); I'm not a big fan of the is_a mess. gimple_code (stmt) == GIMPLE_OMP_PARALLEL is what is used elsewhere. > + if (phony_construct && ctx->record_type) > + { > + gcc_checking_assert (!ctx->receiver_decl); > + ctx->receiver_decl = create_tmp_var > + (build_reference_type (ctx->record_type), ".omp_rec"); Formatting. > @@ -400,7 +401,8 @@ const char * const omp_clause_code_name[] = > "num_gangs", > "num_workers", > "vector_length", > - "tile" > + "tile", > + "griddim" The clause is "_griddim_". Jakub