> -----Original Message----- > From: Richard Biener <rguent...@suse.de> > Sent: Tuesday, August 20, 2024 10:36 AM > To: Richard Sandiford <richard.sandif...@arm.com> > Cc: Prathamesh Kulkarni <prathame...@nvidia.com>; Thomas Schwinge > <tschwi...@baylibre.com>; gcc-patches@gcc.gnu.org > Subject: Re: Re-compute TYPE_MODE and DECL_MODE while streaming in for > accelerator > > External email: Use caution opening links or attachments > > > > Am 19.08.2024 um 20:56 schrieb Richard Sandiford > <richard.sandif...@arm.com>: > > > > Prathamesh Kulkarni <prathame...@nvidia.com> writes: > >> diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc index > >> cbf6041fd68..0420183faf8 100644 > >> --- a/gcc/lto-streamer-in.cc > >> +++ b/gcc/lto-streamer-in.cc > >> @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3. If not > see > >> #include "debug.h" > >> #include "alloc-pool.h" > >> #include "toplev.h" > >> +#include "stor-layout.h" > >> > >> /* Allocator used to hold string slot entries for line map > streaming. > >> */ static struct object_allocator<struct string_slot> > >> *string_slot_allocator; @@ -1752,6 +1753,17 @@ lto_read_tree_1 > (class lto_input_block *ib, class data_in *data_in, tree expr) > >> with -g1, see for example PR113488. */ > >> else if (DECL_P (expr) && DECL_ABSTRACT_ORIGIN (expr) == > expr) > >> DECL_ABSTRACT_ORIGIN (expr) = NULL_TREE; > >> + > >> +#ifdef ACCEL_COMPILER > >> + /* For decl with aggregate type, host streams out VOIDmode. > >> + Compute the correct DECL_MODE by calling relayout_decl. */ > >> + if ((VAR_P (expr) > >> + || TREE_CODE (expr) == PARM_DECL > >> + || TREE_CODE (expr) == FIELD_DECL) > >> + && AGGREGATE_TYPE_P (TREE_TYPE (expr)) > >> + && DECL_MODE (expr) == VOIDmode) > >> + relayout_decl (expr); > >> +#endif > > > > Genuine question, but: is relayout_decl safe in this context? It > does > > a lot more than just reset the mode. It also applies the target > ABI's > > preferences wrt alignment, padding, and so on, rather than > preserving > > those of the host's. > > It would be better to just recompute the mode here. Hi, The attached patch sets DECL_MODE (expr) to TYPE_MODE (TREE_TYPE (expr)) in lto_read_tree_1 instead of calling relayout_decl (expr). I checked layout_decl_type does the same thing for setting decl mode, except for bit fields. Since bit-fields cannot have aggregate type, I am assuming setting DECL_MODE (expr) to TYPE_MODE (TREE_TYPE (expr)) would be OK in this case ?
Sorry if this sounds like a silly ques -- Why would it be unsafe to call relayout_decl for variables that are mapped to accelerator even if it'd not preserve host's properties ? I assumed we want to assign accel's ABI properties for mapped decls (mode being one of them), or am I misunderstanding ? Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com> Thanks, Prathamesh > > Richard > > > Thanks, > > Richard > > > > > >> } > >> } > >> > >> diff --git a/gcc/stor-layout.cc b/gcc/stor-layout.cc index > >> 10c0809914c..0ff8bd1171e 100644 > >> --- a/gcc/stor-layout.cc > >> +++ b/gcc/stor-layout.cc > >> @@ -2396,6 +2396,32 @@ finish_builtin_struct (tree type, const char > *name, tree fields, > >> layout_decl (TYPE_NAME (type), 0); > >> } > >> > >> +/* Compute TYPE_MODE for TYPE (which is ARRAY_TYPE). */ > >> + > >> +void compute_array_mode (tree type) > >> +{ > >> + gcc_assert (TREE_CODE (type) == ARRAY_TYPE); > >> + > >> + SET_TYPE_MODE (type, BLKmode); > >> + if (TYPE_SIZE (type) != 0 > >> + && ! targetm.member_type_forces_blk (type, VOIDmode) > >> + /* BLKmode elements force BLKmode aggregate; > >> + else extract/store fields may lose. */ > >> + && (TYPE_MODE (TREE_TYPE (type)) != BLKmode > >> + || TYPE_NO_FORCE_BLK (TREE_TYPE (type)))) > >> + { > >> + SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type), > >> + TYPE_SIZE (type))); > >> + if (TYPE_MODE (type) != BLKmode > >> + && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT > >> + && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE > (type))) > >> + { > >> + TYPE_NO_FORCE_BLK (type) = 1; > >> + SET_TYPE_MODE (type, BLKmode); > >> + } > >> + } > >> +} > >> + > >> /* Calculate the mode, size, and alignment for TYPE. > >> For an array type, calculate the element separation as well. > >> Record TYPE on the chain of permanent or temporary types @@ > >> -2709,24 +2735,7 @@ layout_type (tree type) > >> align = MAX (align, BITS_PER_UNIT); #endif > >> SET_TYPE_ALIGN (type, align); > >> - SET_TYPE_MODE (type, BLKmode); > >> - if (TYPE_SIZE (type) != 0 > >> - && ! targetm.member_type_forces_blk (type, VOIDmode) > >> - /* BLKmode elements force BLKmode aggregate; > >> - else extract/store fields may lose. */ > >> - && (TYPE_MODE (TREE_TYPE (type)) != BLKmode > >> - || TYPE_NO_FORCE_BLK (TREE_TYPE (type)))) > >> - { > >> - SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type), > >> - TYPE_SIZE (type))); > >> - if (TYPE_MODE (type) != BLKmode > >> - && STRICT_ALIGNMENT && TYPE_ALIGN (type) < > BIGGEST_ALIGNMENT > >> - && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE > (type))) > >> - { > >> - TYPE_NO_FORCE_BLK (type) = 1; > >> - SET_TYPE_MODE (type, BLKmode); > >> - } > >> - } > >> + compute_array_mode (type); > >> if (AGGREGATE_TYPE_P (element)) > >> TYPE_TYPELESS_STORAGE (type) = TYPE_TYPELESS_STORAGE > (element); > >> /* When the element size is constant, check that it is at least > as > >> diff --git a/gcc/stor-layout.h b/gcc/stor-layout.h index > >> 096ca811762..9d9b8c385f6 100644 > >> --- a/gcc/stor-layout.h > >> +++ b/gcc/stor-layout.h > >> @@ -34,6 +34,7 @@ extern tree rli_size_so_far (record_layout_info); > >> extern void normalize_rli (record_layout_info); extern void > >> place_field (record_layout_info, tree); extern void > >> compute_record_mode (tree); > >> +extern void compute_array_mode (tree); > >> extern void finish_bitfield_layout (tree); extern void > >> finish_record_layout (record_layout_info, int); extern void > >> finalize_size_functions (void); diff --git a/gcc/tree-streamer- > in.cc > >> b/gcc/tree-streamer-in.cc index 40029437199..329d218e7d4 100644 > >> --- a/gcc/tree-streamer-in.cc > >> +++ b/gcc/tree-streamer-in.cc > >> @@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not > see > >> #include "attribs.h" > >> #include "asan.h" > >> #include "opts.h" > >> +#include "stor-layout.h" > >> > >> > >> /* Read a STRING_CST from the string table in DATA_IN using input > @@ > >> -395,6 +396,17 @@ unpack_ts_type_common_value_fields (struct > >> bitpack_d *bp, tree expr) #ifdef ACCEL_COMPILER > >> if (TYPE_ALIGN (expr) > targetm.absolute_biggest_alignment) > >> SET_TYPE_ALIGN (expr, targetm.absolute_biggest_alignment); > >> + > >> + /* Host streams out VOIDmode for aggregate type. */ if > >> + (AGGREGATE_TYPE_P (expr) && TYPE_MODE (expr) == VOIDmode) > >> + { > >> + if (TREE_CODE (expr) == ARRAY_TYPE) > >> + compute_array_mode (expr); > >> + else if (RECORD_OR_UNION_TYPE_P (expr)) > >> + compute_record_mode (expr); > >> + else > >> + gcc_unreachable (); > >> + } > >> #endif > >> } > >> > >> diff --git a/gcc/tree-streamer-out.cc b/gcc/tree-streamer-out.cc > >> index b7205287ffb..7de4447a1b5 100644 > >> --- a/gcc/tree-streamer-out.cc > >> +++ b/gcc/tree-streamer-out.cc > >> @@ -187,7 +187,17 @@ pack_ts_fixed_cst_value_fields (struct > bitpack_d > >> *bp, tree expr) static void pack_ts_decl_common_value_fields > (struct > >> bitpack_d *bp, tree expr) { > >> - bp_pack_machine_mode (bp, DECL_MODE (expr)); > >> + /* Similar to TYPE_MODE, avoid streaming out host-specific > DECL_MODE > >> + for aggregate type with offloading enabled, and while > streaming-in > >> + recompute appropriate DECL_MODE for accelerator. */ if > >> + (lto_stream_offload_p > >> + && (VAR_P (expr) > >> + || TREE_CODE (expr) == PARM_DECL > >> + || TREE_CODE (expr) == FIELD_DECL) > >> + && AGGREGATE_TYPE_P (TREE_TYPE (expr))) > >> + bp_pack_machine_mode (bp, VOIDmode); else > >> + bp_pack_machine_mode (bp, DECL_MODE (expr)); > >> bp_pack_value (bp, DECL_NONLOCAL (expr), 1); > >> bp_pack_value (bp, DECL_VIRTUAL_P (expr), 1); > >> bp_pack_value (bp, DECL_IGNORED_P (expr), 1); @@ -317,10 +327,18 > @@ > >> pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree > expr) > >> static void pack_ts_type_common_value_fields (struct bitpack_d *bp, > >> tree expr) { > >> + /* For offloading, avoid streaming out TYPE_MODE for aggregate > type since > >> + it may be host-specific. For eg, aarch64 uses OImode for > ARRAY_TYPE > >> + whose size is 256-bits, which is not representable on > accelerator. > >> + Instead stream out VOIDmode, and while streaming-in, > recompute > >> + appropriate TYPE_MODE for accelerator. */ if > >> + (lto_stream_offload_p && AGGREGATE_TYPE_P (expr)) > >> + bp_pack_machine_mode (bp, VOIDmode); > >> /* for VECTOR_TYPE, TYPE_MODE reevaluates the mode using > target_flags > >> not necessary valid in a global context. > >> Use the raw value previously set by layout_type. */ > >> - bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr)); > >> + else > >> + bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr)); > >> /* TYPE_NO_FORCE_BLK is private to stor-layout and need > >> no streaming. */ > >> bp_pack_value (bp, TYPE_PACKED (expr), 1);
Recompute TYPE_MODE and DECL_MODE for aggregate type for acclerator. The patch streams out VOIDmode for aggregate types with offloading enabled, and recomputes appropriate TYPE_MODE and DECL_MODE while streaming-in on accel side. The rationale for this change is to avoid streaming out host-specific modes that may be used for aggregate types, which may not be representable on the accelerator. For eg, AArch64 uses OImode for ARRAY_TYPE whose size is 256-bits, and nvptx doesn't have OImode, and thus ends up emitting an error from lto_input_mode_table. gcc/ChangeLog: * lto-streamer-in.cc: (lto_read_tree_1): Set DECL_MODE (expr) to TREE_TYPE (TYPE_MODE (expr)) if TREE_TYPE (expr) is aggregate type and offloading is enabled. * stor-layout.cc (layout_type): Move computation of mode for ARRAY_TYPE from ... (compute_array_mode): ... to here. * stor-layout.h (compute_array_mode): Declare. * tree-streamer-in.cc: Include stor-layout.h. (unpack_ts_common_value_fields): Call compute_array_mode if offloading is enabled. * tree-streamer-out.cc (pack_ts_fixed_cst_value_fields): Stream out VOIDmode if decl has aggregate type and offloading is enabled. (pack_ts_type_common_value_fields): Stream out VOIDmode for aggregate type if offloading is enabled. Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com> diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc index cbf6041fd68..64f75807328 100644 --- a/gcc/lto-streamer-in.cc +++ b/gcc/lto-streamer-in.cc @@ -1752,6 +1752,15 @@ lto_read_tree_1 (class lto_input_block *ib, class data_in *data_in, tree expr) with -g1, see for example PR113488. */ else if (DECL_P (expr) && DECL_ABSTRACT_ORIGIN (expr) == expr) DECL_ABSTRACT_ORIGIN (expr) = NULL_TREE; + +#ifdef ACCEL_COMPILER + if ((VAR_P (expr) + || TREE_CODE (expr) == PARM_DECL + || TREE_CODE (expr) == FIELD_DECL) + && AGGREGATE_TYPE_P (TREE_TYPE (expr)) + && DECL_MODE (expr) == VOIDmode) + SET_DECL_MODE (expr, TYPE_MODE (TREE_TYPE (expr))); +#endif } } diff --git a/gcc/stor-layout.cc b/gcc/stor-layout.cc index 10c0809914c..0ff8bd1171e 100644 --- a/gcc/stor-layout.cc +++ b/gcc/stor-layout.cc @@ -2396,6 +2396,32 @@ finish_builtin_struct (tree type, const char *name, tree fields, layout_decl (TYPE_NAME (type), 0); } +/* Compute TYPE_MODE for TYPE (which is ARRAY_TYPE). */ + +void compute_array_mode (tree type) +{ + gcc_assert (TREE_CODE (type) == ARRAY_TYPE); + + SET_TYPE_MODE (type, BLKmode); + if (TYPE_SIZE (type) != 0 + && ! targetm.member_type_forces_blk (type, VOIDmode) + /* BLKmode elements force BLKmode aggregate; + else extract/store fields may lose. */ + && (TYPE_MODE (TREE_TYPE (type)) != BLKmode + || TYPE_NO_FORCE_BLK (TREE_TYPE (type)))) + { + SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type), + TYPE_SIZE (type))); + if (TYPE_MODE (type) != BLKmode + && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT + && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type))) + { + TYPE_NO_FORCE_BLK (type) = 1; + SET_TYPE_MODE (type, BLKmode); + } + } +} + /* Calculate the mode, size, and alignment for TYPE. For an array type, calculate the element separation as well. Record TYPE on the chain of permanent or temporary types @@ -2709,24 +2735,7 @@ layout_type (tree type) align = MAX (align, BITS_PER_UNIT); #endif SET_TYPE_ALIGN (type, align); - SET_TYPE_MODE (type, BLKmode); - if (TYPE_SIZE (type) != 0 - && ! targetm.member_type_forces_blk (type, VOIDmode) - /* BLKmode elements force BLKmode aggregate; - else extract/store fields may lose. */ - && (TYPE_MODE (TREE_TYPE (type)) != BLKmode - || TYPE_NO_FORCE_BLK (TREE_TYPE (type)))) - { - SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type), - TYPE_SIZE (type))); - if (TYPE_MODE (type) != BLKmode - && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT - && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type))) - { - TYPE_NO_FORCE_BLK (type) = 1; - SET_TYPE_MODE (type, BLKmode); - } - } + compute_array_mode (type); if (AGGREGATE_TYPE_P (element)) TYPE_TYPELESS_STORAGE (type) = TYPE_TYPELESS_STORAGE (element); /* When the element size is constant, check that it is at least as diff --git a/gcc/stor-layout.h b/gcc/stor-layout.h index 096ca811762..9d9b8c385f6 100644 --- a/gcc/stor-layout.h +++ b/gcc/stor-layout.h @@ -34,6 +34,7 @@ extern tree rli_size_so_far (record_layout_info); extern void normalize_rli (record_layout_info); extern void place_field (record_layout_info, tree); extern void compute_record_mode (tree); +extern void compute_array_mode (tree); extern void finish_bitfield_layout (tree); extern void finish_record_layout (record_layout_info, int); extern void finalize_size_functions (void); diff --git a/gcc/tree-streamer-in.cc b/gcc/tree-streamer-in.cc index 40029437199..329d218e7d4 100644 --- a/gcc/tree-streamer-in.cc +++ b/gcc/tree-streamer-in.cc @@ -35,6 +35,7 @@ along with GCC; see the file COPYING3. If not see #include "attribs.h" #include "asan.h" #include "opts.h" +#include "stor-layout.h" /* Read a STRING_CST from the string table in DATA_IN using input @@ -395,6 +396,17 @@ unpack_ts_type_common_value_fields (struct bitpack_d *bp, tree expr) #ifdef ACCEL_COMPILER if (TYPE_ALIGN (expr) > targetm.absolute_biggest_alignment) SET_TYPE_ALIGN (expr, targetm.absolute_biggest_alignment); + + /* Host streams out VOIDmode for aggregate type. */ + if (AGGREGATE_TYPE_P (expr) && TYPE_MODE (expr) == VOIDmode) + { + if (TREE_CODE (expr) == ARRAY_TYPE) + compute_array_mode (expr); + else if (RECORD_OR_UNION_TYPE_P (expr)) + compute_record_mode (expr); + else + gcc_unreachable (); + } #endif } diff --git a/gcc/tree-streamer-out.cc b/gcc/tree-streamer-out.cc index b7205287ffb..7de4447a1b5 100644 --- a/gcc/tree-streamer-out.cc +++ b/gcc/tree-streamer-out.cc @@ -187,7 +187,17 @@ pack_ts_fixed_cst_value_fields (struct bitpack_d *bp, tree expr) static void pack_ts_decl_common_value_fields (struct bitpack_d *bp, tree expr) { - bp_pack_machine_mode (bp, DECL_MODE (expr)); + /* Similar to TYPE_MODE, avoid streaming out host-specific DECL_MODE + for aggregate type with offloading enabled, and while streaming-in + recompute appropriate DECL_MODE for accelerator. */ + if (lto_stream_offload_p + && (VAR_P (expr) + || TREE_CODE (expr) == PARM_DECL + || TREE_CODE (expr) == FIELD_DECL) + && AGGREGATE_TYPE_P (TREE_TYPE (expr))) + bp_pack_machine_mode (bp, VOIDmode); + else + bp_pack_machine_mode (bp, DECL_MODE (expr)); bp_pack_value (bp, DECL_NONLOCAL (expr), 1); bp_pack_value (bp, DECL_VIRTUAL_P (expr), 1); bp_pack_value (bp, DECL_IGNORED_P (expr), 1); @@ -317,10 +327,18 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree expr) static void pack_ts_type_common_value_fields (struct bitpack_d *bp, tree expr) { + /* For offloading, avoid streaming out TYPE_MODE for aggregate type since + it may be host-specific. For eg, aarch64 uses OImode for ARRAY_TYPE + whose size is 256-bits, which is not representable on accelerator. + Instead stream out VOIDmode, and while streaming-in, recompute + appropriate TYPE_MODE for accelerator. */ + if (lto_stream_offload_p && AGGREGATE_TYPE_P (expr)) + bp_pack_machine_mode (bp, VOIDmode); /* for VECTOR_TYPE, TYPE_MODE reevaluates the mode using target_flags not necessary valid in a global context. Use the raw value previously set by layout_type. */ - bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr)); + else + bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr)); /* TYPE_NO_FORCE_BLK is private to stor-layout and need no streaming. */ bp_pack_value (bp, TYPE_PACKED (expr), 1);