Hi Bernd, I discovered an issue in the LTO stream-out for the target: currently any file (even one without any pragma) compiled with -fopenmp/-fopenacc contains .gnu.target_lto_* sections. This increases the size of the object file and makes lto-wrapper run mkoffload unnecessarily.
Therefore, I propose to replace the condition before ipa_write_summaries: - if (flag_openacc || flag_openmp) + if ((flag_openacc || flag_openmp) && !(vec_safe_is_empty (offload_funcs) && vec_safe_is_empty (offload_vars))) But to do this, the offload_vars must be filled before the check (offload_funcs is already filled in expand_omp_target). Here is the updated patch. Bootstrap passed. OK for gomp-4_0-branch? On 13 Aug 20:19, Ilya Verbin wrote: > Here is the updated patch. offload_funcs/vars are now declared in omp-low.h, > the functions have a comment. Also it fixes the issue of offload_funcs/vars > corruption by the garbage collector. OK for gomp-4_0-branch? --- gcc/Makefile.in | 1 + gcc/cgraphunit.c | 25 ++++++++++++- gcc/gengtype.c | 2 +- gcc/lto-cgraph.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++ gcc/lto-section-in.c | 3 +- gcc/lto-streamer-out.c | 2 + gcc/lto-streamer.h | 3 ++ gcc/lto/lto.c | 2 + gcc/omp-low.c | 74 ++++++++------------------------------ gcc/omp-low.h | 3 ++ 10 files changed, 147 insertions(+), 61 deletions(-) diff --git a/gcc/Makefile.in b/gcc/Makefile.in index bfa5f32..372f586 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -2290,6 +2290,7 @@ GTFILES = $(CPP_ID_DATA_H) $(srcdir)/input.h $(srcdir)/coretypes.h \ $(srcdir)/tree-profile.c $(srcdir)/tree-nested.c \ $(srcdir)/tree-parloops.c \ $(srcdir)/omp-low.c \ + $(srcdir)/omp-low.h \ $(srcdir)/targhooks.c $(out_file) $(srcdir)/passes.c $(srcdir)/cgraphunit.c \ $(srcdir)/cgraphclones.c \ $(srcdir)/tree-phinodes.c \ diff --git a/gcc/cgraphunit.c b/gcc/cgraphunit.c index 308c534..f0c9f5c 100644 --- a/gcc/cgraphunit.c +++ b/gcc/cgraphunit.c @@ -211,6 +211,7 @@ along with GCC; see the file COPYING3. If not see #include "tree-nested.h" #include "gimplify.h" #include "dbgcnt.h" +#include "omp-low.h" #include "lto-section-names.h" /* Queue of cgraph nodes scheduled to be added into cgraph. 
This is a @@ -2039,6 +2040,24 @@ output_in_order (void) free (nodes); } +/* Collect all global variables with "omp declare target" attribute into + OFFLOAD_VARS. It will be streamed out in ipa_write_summaries. */ + +static void +init_offload_var_table (void) +{ + struct varpool_node *vnode; + FOR_EACH_DEFINED_VARIABLE (vnode) + { + if (!lookup_attribute ("omp declare target", + DECL_ATTRIBUTES (vnode->decl)) + || TREE_CODE (vnode->decl) != VAR_DECL + || DECL_SIZE (vnode->decl) == 0) + continue; + vec_safe_push (offload_vars, vnode->decl); + } +} + static void ipa_passes (void) { @@ -2089,7 +2108,11 @@ ipa_passes (void) if (!in_lto_p) { - if (flag_openacc || flag_openmp) + init_offload_var_table (); + + if ((flag_openacc || flag_openmp) + && !(vec_safe_is_empty (offload_funcs) + && vec_safe_is_empty (offload_vars))) { section_name_prefix = OMP_SECTION_NAME_PREFIX; ipa_write_summaries (true); diff --git a/gcc/gengtype.c b/gcc/gengtype.c index ffe3f94..5bcbbe2 100644 --- a/gcc/gengtype.c +++ b/gcc/gengtype.c @@ -1800,7 +1800,7 @@ open_base_files (void) "tree-ssa.h", "reload.h", "cpp-id-data.h", "tree-chrec.h", "except.h", "output.h", "cfgloop.h", "target.h", "ipa-prop.h", "lto-streamer.h", "target-globals.h", - "ipa-inline.h", "dwarf2out.h", NULL + "ipa-inline.h", "dwarf2out.h", "omp-low.h", NULL }; const char *const *ifp; outf_p gtype_desc_c; diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c index bc05400..8fb7078 100644 --- a/gcc/lto-cgraph.c +++ b/gcc/lto-cgraph.c @@ -52,6 +52,7 @@ along with GCC; see the file COPYING3. If not see #include "context.h" #include "pass_manager.h" #include "ipa-utils.h" +#include "omp-low.h" /* True when asm nodes has been output. */ bool asm_nodes_output = false; @@ -1044,6 +1045,49 @@ read_string (struct lto_input_block *ib) return str; } +/* Output function/variable tables that will allow libgomp to look up offload + target code. OFFLOAD_FUNCS is filled in expand_omp_target, OFFLOAD_VARS is + filled in ipa_passes. 
In WHOPR (partitioned) mode during the WPA stage both + OFFLOAD_FUNCS and OFFLOAD_VARS are filled by input_offload_tables. */ + +void +output_offload_tables (void) +{ + if (vec_safe_is_empty (offload_funcs) && vec_safe_is_empty (offload_vars)) + return; + + struct lto_simple_output_block *ob + = lto_create_simple_output_block (LTO_section_offload_table); + + for (unsigned i = 0; i < vec_safe_length (offload_funcs); i++) + { + streamer_write_enum (ob->main_stream, LTO_symtab_tags, + LTO_symtab_last_tag, LTO_symtab_unavail_node); + lto_output_fn_decl_index (ob->decl_state, ob->main_stream, + (*offload_funcs)[i]); + } + + for (unsigned i = 0; i < vec_safe_length (offload_vars); i++) + { + streamer_write_enum (ob->main_stream, LTO_symtab_tags, + LTO_symtab_last_tag, LTO_symtab_variable); + lto_output_var_decl_index (ob->decl_state, ob->main_stream, + (*offload_vars)[i]); + } + + streamer_write_uhwi_stream (ob->main_stream, 0); + lto_destroy_simple_output_block (ob); + + /* In WHOPR mode during the WPA stage the joint offload tables need to be + streamed to one partition only. That's why we free offload_funcs and + offload_vars after the first call of output_offload_tables. */ + if (flag_wpa) + { + vec_free (offload_funcs); + vec_free (offload_vars); + } +} + /* Overwrite the information in NODE based on FILE_DATA, TAG, FLAGS, STACK_SIZE, SELF_TIME and SELF_SIZE. This is called either to initialize NODE or to replace the values in it, for instance because the first @@ -1739,6 +1783,55 @@ input_symtab (void) } } +/* Input function/variable tables that will allow libgomp to look up offload + target code, and store them into OFFLOAD_FUNCS and OFFLOAD_VARS. 
*/ + +void +input_offload_tables (void) +{ + struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data (); + struct lto_file_decl_data *file_data; + unsigned int j = 0; + + while ((file_data = file_data_vec[j++])) + { + const char *data; + size_t len; + struct lto_input_block *ib + = lto_create_simple_input_block (file_data, LTO_section_offload_table, + &data, &len); + if (!ib) + continue; + + enum LTO_symtab_tags tag + = streamer_read_enum (ib, LTO_symtab_tags, LTO_symtab_last_tag); + while (tag) + { + if (tag == LTO_symtab_unavail_node) + { + int decl_index = streamer_read_uhwi (ib); + tree fn_decl + = lto_file_decl_data_get_fn_decl (file_data, decl_index); + vec_safe_push (offload_funcs, fn_decl); + } + else if (tag == LTO_symtab_variable) + { + int decl_index = streamer_read_uhwi (ib); + tree var_decl + = lto_file_decl_data_get_var_decl (file_data, decl_index); + vec_safe_push (offload_vars, var_decl); + } + else + fatal_error ("invalid offload table in %s", file_data->file_name); + + tag = streamer_read_enum (ib, LTO_symtab_tags, LTO_symtab_last_tag); + } + + lto_destroy_simple_input_block (file_data, LTO_section_offload_table, + ib, data, len); + } +} + /* True when we need optimization summary for NODE. */ static int diff --git a/gcc/lto-section-in.c b/gcc/lto-section-in.c index d887763..b705c75 100644 --- a/gcc/lto-section-in.c +++ b/gcc/lto-section-in.c @@ -60,7 +60,8 @@ const char *lto_section_name[LTO_N_SECTION_TYPES] = "opts", "cgraphopt", "inline", - "ipcp_trans" + "ipcp_trans", + "offload_table" }; diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c index 3064562..ff8572d 100644 --- a/gcc/lto-streamer-out.c +++ b/gcc/lto-streamer-out.c @@ -2108,6 +2108,8 @@ lto_output (void) statements using the statement UIDs. 
*/ output_symtab (); + output_offload_tables (); + #ifdef ENABLE_CHECKING lto_bitmap_free (output); #endif diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h index eedec95..3607634 100644 --- a/gcc/lto-streamer.h +++ b/gcc/lto-streamer.h @@ -248,6 +248,7 @@ enum lto_section_type LTO_section_cgraph_opt_sum, LTO_section_inline_summary, LTO_section_ipcp_transform, + LTO_section_offload_table, LTO_N_SECTION_TYPES /* Must be last. */ }; @@ -884,6 +885,8 @@ bool lto_symtab_encoder_encode_initializer_p (lto_symtab_encoder_t, varpool_node *); void output_symtab (void); void input_symtab (void); +void output_offload_tables (void); +void input_offload_tables (void); bool referenced_from_other_partition_p (struct ipa_ref_list *, lto_symtab_encoder_t); bool reachable_from_other_partition_p (struct cgraph_node *, diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c index 28c896d..a0b606c 100644 --- a/gcc/lto/lto.c +++ b/gcc/lto/lto.c @@ -3015,6 +3015,8 @@ read_cgraph_and_symbols (unsigned nfiles, const char **fnames) /* Read the symtab. */ input_symtab (); + input_offload_tables (); + /* Store resolutions into the symbol table. */ FOR_EACH_SYMBOL (snode) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index ce97a0e..1ad98ab 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -240,6 +240,9 @@ omp_get_id (tree node) /* Holds a decl for __OPENMP_TARGET__. */ static GTY(()) tree offload_symbol_decl; +/* Holds offload tables with decls. */ +vec<tree, va_gc> *offload_funcs, *offload_vars; + /* Get the __OPENMP_TARGET__ symbol. */ static tree get_offload_symbol_decl (void) @@ -8906,6 +8909,9 @@ expand_omp_target (struct omp_region *region) DECL_STRUCT_FUNCTION (child_fn)->curr_properties = cfun->curr_properties; cgraph_add_new_function (child_fn, true); + /* Add the new function to the offload table. */ + vec_safe_push (offload_funcs, child_fn); + /* Fix the callgraph edges for child_cfun. Those for cfun will be fixed in a following pass. 
*/ push_cfun (child_cfun); @@ -13730,71 +13736,23 @@ add_decls_addresses_to_decl_constructor (vec<tree, va_gc> *v_decls, void omp_finish_file (void) { - struct cgraph_node *node; - struct varpool_node *vnode; const char *funcs_section_name = OFFLOAD_FUNC_TABLE_SECTION_NAME; const char *vars_section_name = OFFLOAD_VAR_TABLE_SECTION_NAME; - vec<tree, va_gc> *v_funcs, *v_vars; - - vec_alloc (v_vars, 0); - vec_alloc (v_funcs, 0); - - /* Collect all omp-target functions. */ - FOR_EACH_DEFINED_FUNCTION (node) - { - /* TODO: This check could fail on functions, created by omp - parallel/task pragmas. It's better to name outlined for offloading - functions in some different way and to check here the function name. - It could be something like "*_omp_tgtfn" in contrast with "*_omp_fn" - for functions from omp parallel/task pragmas. */ - if (!lookup_attribute ("omp declare target", - DECL_ATTRIBUTES (node->decl)) - || !DECL_ARTIFICIAL (node->decl)) - continue; - vec_safe_push (v_funcs, node->decl); - } - /* Collect all omp-target global variables. */ - FOR_EACH_DEFINED_VARIABLE (vnode) - { - if (!lookup_attribute ("omp declare target", - DECL_ATTRIBUTES (vnode->decl)) - || TREE_CODE (vnode->decl) != VAR_DECL - || DECL_SIZE (vnode->decl) == 0) - continue; - vec_safe_push (v_vars, vnode->decl); - } - unsigned num_vars = vec_safe_length (v_vars); - unsigned num_funcs = vec_safe_length (v_funcs); + unsigned num_funcs = vec_safe_length (offload_funcs); + unsigned num_vars = vec_safe_length (offload_vars); - if (num_vars == 0 && num_funcs == 0) + if (num_funcs == 0 && num_vars == 0) return; -#ifdef ACCEL_COMPILER - /* Decls are placed in reversed order in fat-objects, so we need to - revert them back if we compile target. 
*/ - for (unsigned i = 0; i < num_funcs / 2; i++) - { - tree it = (*v_funcs)[i]; - (*v_funcs)[i] = (*v_funcs)[num_funcs - i - 1]; - (*v_funcs)[num_funcs - i - 1] = it; - } - for (unsigned i = 0; i < num_vars / 2; i++) - { - tree it = (*v_vars)[i]; - (*v_vars)[i] = (*v_vars)[num_vars - i - 1]; - (*v_vars)[num_vars - i - 1] = it; - } -#endif - if (targetm_common.have_named_sections) { vec<constructor_elt, va_gc> *v_f, *v_v; vec_alloc (v_f, num_funcs); vec_alloc (v_v, num_vars * 2); - add_decls_addresses_to_decl_constructor (v_funcs, v_f); - add_decls_addresses_to_decl_constructor (v_vars, v_v); + add_decls_addresses_to_decl_constructor (offload_funcs, v_f); + add_decls_addresses_to_decl_constructor (offload_vars, v_v); tree vars_decl_type = build_array_type_nelts (pointer_sized_int_node, num_vars * 2); @@ -13817,7 +13775,7 @@ omp_finish_file (void) DECL_INITIAL (vars_decl) = ctor_v; set_decl_section_name (funcs_decl, funcs_section_name); set_decl_section_name (vars_decl, vars_section_name); - + varpool_assemble_decl (varpool_node_for_decl (vars_decl)); varpool_assemble_decl (varpool_node_for_decl (funcs_decl)); } @@ -13825,14 +13783,14 @@ omp_finish_file (void) { for (unsigned i = 0; i < num_funcs; i++) { - tree it = (*v_funcs)[i]; + tree it = (*offload_funcs)[i]; targetm.record_offload_symbol (it); - } + } for (unsigned i = 0; i < num_vars; i++) { - tree it = (*v_vars)[i]; + tree it = (*offload_vars)[i]; targetm.record_offload_symbol (it); - } + } } } diff --git a/gcc/omp-low.h b/gcc/omp-low.h index f904eda..ac587d0 100644 --- a/gcc/omp-low.h +++ b/gcc/omp-low.h @@ -29,4 +29,7 @@ extern tree omp_reduction_init (tree, tree); extern bool make_gimple_omp_edges (basic_block, struct omp_region **, int *); extern void omp_finish_file (void); +extern GTY(()) vec<tree, va_gc> *offload_funcs; +extern GTY(()) vec<tree, va_gc> *offload_vars; + #endif /* GCC_OMP_LOW_H */ -- 1.7.1 Thanks, -- Ilya