> -----Original Message-----
> From: Richard Biener <rguent...@suse.de>
> Sent: Tuesday, August 20, 2024 10:36 AM
> To: Richard Sandiford <richard.sandif...@arm.com>
> Cc: Prathamesh Kulkarni <prathame...@nvidia.com>; Thomas Schwinge
> <tschwi...@baylibre.com>; gcc-patches@gcc.gnu.org
> Subject: Re: Re-compute TYPE_MODE and DECL_MODE while streaming in for
> accelerator
> 
> External email: Use caution opening links or attachments
> 
> 
> > Am 19.08.2024 um 20:56 schrieb Richard Sandiford
> <richard.sandif...@arm.com>:
> >
> > Prathamesh Kulkarni <prathame...@nvidia.com> writes:
> >> diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc index
> >> cbf6041fd68..0420183faf8 100644
> >> --- a/gcc/lto-streamer-in.cc
> >> +++ b/gcc/lto-streamer-in.cc
> >> @@ -44,6 +44,7 @@ along with GCC; see the file COPYING3.  If not
> see
> >> #include "debug.h"
> >> #include "alloc-pool.h"
> >> #include "toplev.h"
> >> +#include "stor-layout.h"
> >>
> >> /* Allocator used to hold string slot entries for line map
> streaming.
> >> */ static struct object_allocator<struct string_slot>
> >> *string_slot_allocator; @@ -1752,6 +1753,17 @@ lto_read_tree_1
> (class lto_input_block *ib, class data_in *data_in, tree expr)
> >>     with -g1, see for example PR113488.  */
> >>       else if (DECL_P (expr) && DECL_ABSTRACT_ORIGIN (expr) ==
> expr)
> >>    DECL_ABSTRACT_ORIGIN (expr) = NULL_TREE;
> >> +
> >> +#ifdef ACCEL_COMPILER
> >> +      /* For decl with aggregate type, host streams out VOIDmode.
> >> +     Compute the correct DECL_MODE by calling relayout_decl.  */
> >> +      if ((VAR_P (expr)
> >> +       || TREE_CODE (expr) == PARM_DECL
> >> +       || TREE_CODE (expr) == FIELD_DECL)
> >> +      && AGGREGATE_TYPE_P (TREE_TYPE (expr))
> >> +      && DECL_MODE (expr) == VOIDmode)
> >> +    relayout_decl (expr);
> >> +#endif
> >
> > Genuine question, but: is relayout_decl safe in this context?  It
> does
> > a lot more than just reset the mode.  It also applies the target
> ABI's
> > preferences wrt alignment, padding, and so on, rather than
> preserving
> > those of the host's.
> 
> It would be better to just recompute the mode here.
Hi,
The attached patch sets DECL_MODE (expr) to TYPE_MODE (TREE_TYPE (expr)) in 
lto_read_tree_1 instead of calling relayout_decl (expr).
I checked layout_decl_type does the same thing for setting decl mode, except 
for bit fields. Since bit-fields cannot have
aggregate type, I am assuming setting DECL_MODE (expr) to TYPE_MODE (TREE_TYPE 
(expr)) would be OK in this case ?

Sorry if this sounds like a silly ques -- Why would it be unsafe to call 
relayout_decl for variables that are mapped to accelerator even
if it'd not preserve host's properties ? I assumed we want to assign accel's 
ABI properties for mapped decls (mode being one of them),
or am I misunderstanding ?

Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com>

Thanks,
Prathamesh                                                                      
                   
> 
> Richard
> 
> > Thanks,
> > Richard
> >
> >
> >>     }
> >> }
> >>
> >> diff --git a/gcc/stor-layout.cc b/gcc/stor-layout.cc index
> >> 10c0809914c..0ff8bd1171e 100644
> >> --- a/gcc/stor-layout.cc
> >> +++ b/gcc/stor-layout.cc
> >> @@ -2396,6 +2396,32 @@ finish_builtin_struct (tree type, const char
> *name, tree fields,
> >>   layout_decl (TYPE_NAME (type), 0);
> >> }
> >>
> >> +/* Compute TYPE_MODE for TYPE (which is ARRAY_TYPE).  */
> >> +
> >> +void compute_array_mode (tree type)
> >> +{
> >> +  gcc_assert (TREE_CODE (type) == ARRAY_TYPE);
> >> +
> >> +  SET_TYPE_MODE (type, BLKmode);
> >> +  if (TYPE_SIZE (type) != 0
> >> +      && ! targetm.member_type_forces_blk (type, VOIDmode)
> >> +      /* BLKmode elements force BLKmode aggregate;
> >> +     else extract/store fields may lose.  */
> >> +      && (TYPE_MODE (TREE_TYPE (type)) != BLKmode
> >> +      || TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
> >> +    {
> >> +      SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
> >> +                       TYPE_SIZE (type)));
> >> +      if (TYPE_MODE (type) != BLKmode
> >> +      && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT
> >> +      && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE
> (type)))
> >> +    {
> >> +      TYPE_NO_FORCE_BLK (type) = 1;
> >> +      SET_TYPE_MODE (type, BLKmode);
> >> +    }
> >> +    }
> >> +}
> >> +
> >> /* Calculate the mode, size, and alignment for TYPE.
> >>    For an array type, calculate the element separation as well.
> >>    Record TYPE on the chain of permanent or temporary types @@
> >> -2709,24 +2735,7 @@ layout_type (tree type)
> >>    align = MAX (align, BITS_PER_UNIT); #endif
> >>    SET_TYPE_ALIGN (type, align);
> >> -    SET_TYPE_MODE (type, BLKmode);
> >> -    if (TYPE_SIZE (type) != 0
> >> -        && ! targetm.member_type_forces_blk (type, VOIDmode)
> >> -        /* BLKmode elements force BLKmode aggregate;
> >> -           else extract/store fields may lose.  */
> >> -        && (TYPE_MODE (TREE_TYPE (type)) != BLKmode
> >> -        || TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
> >> -      {
> >> -        SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
> >> -                         TYPE_SIZE (type)));
> >> -        if (TYPE_MODE (type) != BLKmode
> >> -        && STRICT_ALIGNMENT && TYPE_ALIGN (type) <
> BIGGEST_ALIGNMENT
> >> -        && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE
> (type)))
> >> -          {
> >> -        TYPE_NO_FORCE_BLK (type) = 1;
> >> -        SET_TYPE_MODE (type, BLKmode);
> >> -          }
> >> -      }
> >> +    compute_array_mode (type);
> >>    if (AGGREGATE_TYPE_P (element))
> >>      TYPE_TYPELESS_STORAGE (type) = TYPE_TYPELESS_STORAGE
> (element);
> >>    /* When the element size is constant, check that it is at least
> as
> >> diff --git a/gcc/stor-layout.h b/gcc/stor-layout.h index
> >> 096ca811762..9d9b8c385f6 100644
> >> --- a/gcc/stor-layout.h
> >> +++ b/gcc/stor-layout.h
> >> @@ -34,6 +34,7 @@ extern tree rli_size_so_far (record_layout_info);
> >> extern void normalize_rli (record_layout_info); extern void
> >> place_field (record_layout_info, tree); extern void
> >> compute_record_mode (tree);
> >> +extern void compute_array_mode (tree);
> >> extern void finish_bitfield_layout (tree); extern void
> >> finish_record_layout (record_layout_info, int); extern void
> >> finalize_size_functions (void); diff --git a/gcc/tree-streamer-
> in.cc
> >> b/gcc/tree-streamer-in.cc index 40029437199..329d218e7d4 100644
> >> --- a/gcc/tree-streamer-in.cc
> >> +++ b/gcc/tree-streamer-in.cc
> >> @@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.  If not
> see
> >> #include "attribs.h"
> >> #include "asan.h"
> >> #include "opts.h"
> >> +#include "stor-layout.h"
> >>
> >>
> >> /* Read a STRING_CST from the string table in DATA_IN using input
> @@
> >> -395,6 +396,17 @@ unpack_ts_type_common_value_fields (struct
> >> bitpack_d *bp, tree expr) #ifdef ACCEL_COMPILER
> >>   if (TYPE_ALIGN (expr) > targetm.absolute_biggest_alignment)
> >>     SET_TYPE_ALIGN (expr, targetm.absolute_biggest_alignment);
> >> +
> >> +  /* Host streams out VOIDmode for aggregate type. */  if
> >> + (AGGREGATE_TYPE_P (expr) && TYPE_MODE (expr) == VOIDmode)
> >> +    {
> >> +      if (TREE_CODE (expr) == ARRAY_TYPE)
> >> +    compute_array_mode (expr);
> >> +      else if (RECORD_OR_UNION_TYPE_P (expr))
> >> +    compute_record_mode (expr);
> >> +      else
> >> +    gcc_unreachable ();
> >> +    }
> >> #endif
> >> }
> >>
> >> diff --git a/gcc/tree-streamer-out.cc b/gcc/tree-streamer-out.cc
> >> index b7205287ffb..7de4447a1b5 100644
> >> --- a/gcc/tree-streamer-out.cc
> >> +++ b/gcc/tree-streamer-out.cc
> >> @@ -187,7 +187,17 @@ pack_ts_fixed_cst_value_fields (struct
> bitpack_d
> >> *bp, tree expr) static void pack_ts_decl_common_value_fields
> (struct
> >> bitpack_d *bp, tree expr) {
> >> -  bp_pack_machine_mode (bp, DECL_MODE (expr));
> >> +  /* Similar to TYPE_MODE, avoid streaming out host-specific
> DECL_MODE
> >> +     for aggregate type with offloading enabled, and while
> streaming-in
> >> +     recompute appropriate DECL_MODE for accelerator.  */  if
> >> + (lto_stream_offload_p
> >> +      && (VAR_P (expr)
> >> +      || TREE_CODE (expr) == PARM_DECL
> >> +      || TREE_CODE (expr) == FIELD_DECL)
> >> +      && AGGREGATE_TYPE_P (TREE_TYPE (expr)))
> >> +    bp_pack_machine_mode (bp, VOIDmode);  else
> >> +    bp_pack_machine_mode (bp, DECL_MODE (expr));
> >>   bp_pack_value (bp, DECL_NONLOCAL (expr), 1);
> >>   bp_pack_value (bp, DECL_VIRTUAL_P (expr), 1);
> >>   bp_pack_value (bp, DECL_IGNORED_P (expr), 1); @@ -317,10 +327,18
> @@
> >> pack_ts_function_decl_value_fields (struct bitpack_d *bp, tree
> expr)
> >> static void pack_ts_type_common_value_fields (struct bitpack_d *bp,
> >> tree expr) {
> >> +  /* For offloading, avoid streaming out TYPE_MODE for aggregate
> type since
> >> +     it may be host-specific. For eg, aarch64 uses OImode for
> ARRAY_TYPE
> >> +     whose size is 256-bits, which is not representable on
> accelerator.
> >> +     Instead stream out VOIDmode, and while streaming-in,
> recompute
> >> +     appropriate TYPE_MODE for accelerator.  */  if
> >> + (lto_stream_offload_p && AGGREGATE_TYPE_P (expr))
> >> +    bp_pack_machine_mode (bp, VOIDmode);
> >>   /* for VECTOR_TYPE, TYPE_MODE reevaluates the mode using
> target_flags
> >>      not necessary valid in a global context.
> >>      Use the raw value previously set by layout_type.  */
> >> -  bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr));
> >> +  else
> >> +    bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr));
> >>   /* TYPE_NO_FORCE_BLK is private to stor-layout and need
> >>      no streaming.  */
> >>   bp_pack_value (bp, TYPE_PACKED (expr), 1);
Recompute TYPE_MODE and DECL_MODE for aggregate type for acclerator.

The patch streams out VOIDmode for aggregate types with offloading enabled,
and recomputes appropriate TYPE_MODE and DECL_MODE while streaming-in on accel
side. The rationale for this change is to avoid streaming out host-specific
modes that may be used for aggregate types, which may not be representable on
the accelerator. For eg, AArch64 uses OImode for ARRAY_TYPE whose size is 
256-bits,
and nvptx doesn't have OImode, and thus ends up emitting an error from
lto_input_mode_table.

gcc/ChangeLog:
        * lto-streamer-in.cc: (lto_read_tree_1): Set DECL_MODE (expr) to
        TREE_TYPE (TYPE_MODE (expr)) if TREE_TYPE (expr) is aggregate type and
        offloading is enabled.
        * stor-layout.cc (layout_type): Move computation of mode for
        ARRAY_TYPE from ...
        (compute_array_mode): ... to here.
        * stor-layout.h (compute_array_mode): Declare.
        * tree-streamer-in.cc: Include stor-layout.h.
        (unpack_ts_common_value_fields): Call compute_array_mode if offloading
        is enabled.
        * tree-streamer-out.cc (pack_ts_fixed_cst_value_fields): Stream out
        VOIDmode if decl has aggregate type and offloading is enabled.
        (pack_ts_type_common_value_fields): Stream out VOIDmode for aggregate
        type if offloading is enabled.

Signed-off-by: Prathamesh Kulkarni <prathame...@nvidia.com>

diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
index cbf6041fd68..64f75807328 100644
--- a/gcc/lto-streamer-in.cc
+++ b/gcc/lto-streamer-in.cc
@@ -1752,6 +1752,15 @@ lto_read_tree_1 (class lto_input_block *ib, class 
data_in *data_in, tree expr)
         with -g1, see for example PR113488.  */
       else if (DECL_P (expr) && DECL_ABSTRACT_ORIGIN (expr) == expr)
        DECL_ABSTRACT_ORIGIN (expr) = NULL_TREE;
+
+#ifdef ACCEL_COMPILER
+      if ((VAR_P (expr)
+          || TREE_CODE (expr) == PARM_DECL
+          || TREE_CODE (expr) == FIELD_DECL)
+         && AGGREGATE_TYPE_P (TREE_TYPE (expr))
+         && DECL_MODE (expr) == VOIDmode)
+       SET_DECL_MODE (expr, TYPE_MODE (TREE_TYPE (expr)));
+#endif
     }
 }
 
diff --git a/gcc/stor-layout.cc b/gcc/stor-layout.cc
index 10c0809914c..0ff8bd1171e 100644
--- a/gcc/stor-layout.cc
+++ b/gcc/stor-layout.cc
@@ -2396,6 +2396,32 @@ finish_builtin_struct (tree type, const char *name, tree 
fields,
   layout_decl (TYPE_NAME (type), 0);
 }
 
+/* Compute TYPE_MODE for TYPE (which is ARRAY_TYPE).  */
+
+void compute_array_mode (tree type)
+{
+  gcc_assert (TREE_CODE (type) == ARRAY_TYPE);
+
+  SET_TYPE_MODE (type, BLKmode);
+  if (TYPE_SIZE (type) != 0
+      && ! targetm.member_type_forces_blk (type, VOIDmode)
+      /* BLKmode elements force BLKmode aggregate;
+        else extract/store fields may lose.  */
+      && (TYPE_MODE (TREE_TYPE (type)) != BLKmode
+         || TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
+    {
+      SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
+                                          TYPE_SIZE (type)));
+      if (TYPE_MODE (type) != BLKmode
+         && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT
+         && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
+       {
+         TYPE_NO_FORCE_BLK (type) = 1;
+         SET_TYPE_MODE (type, BLKmode);
+       }
+    }
+}
+
 /* Calculate the mode, size, and alignment for TYPE.
    For an array type, calculate the element separation as well.
    Record TYPE on the chain of permanent or temporary types
@@ -2709,24 +2735,7 @@ layout_type (tree type)
        align = MAX (align, BITS_PER_UNIT);
 #endif
        SET_TYPE_ALIGN (type, align);
-       SET_TYPE_MODE (type, BLKmode);
-       if (TYPE_SIZE (type) != 0
-           && ! targetm.member_type_forces_blk (type, VOIDmode)
-           /* BLKmode elements force BLKmode aggregate;
-              else extract/store fields may lose.  */
-           && (TYPE_MODE (TREE_TYPE (type)) != BLKmode
-               || TYPE_NO_FORCE_BLK (TREE_TYPE (type))))
-         {
-           SET_TYPE_MODE (type, mode_for_array (TREE_TYPE (type),
-                                                TYPE_SIZE (type)));
-           if (TYPE_MODE (type) != BLKmode
-               && STRICT_ALIGNMENT && TYPE_ALIGN (type) < BIGGEST_ALIGNMENT
-               && TYPE_ALIGN (type) < GET_MODE_ALIGNMENT (TYPE_MODE (type)))
-             {
-               TYPE_NO_FORCE_BLK (type) = 1;
-               SET_TYPE_MODE (type, BLKmode);
-             }
-         }
+       compute_array_mode (type);
        if (AGGREGATE_TYPE_P (element))
          TYPE_TYPELESS_STORAGE (type) = TYPE_TYPELESS_STORAGE (element);
        /* When the element size is constant, check that it is at least as
diff --git a/gcc/stor-layout.h b/gcc/stor-layout.h
index 096ca811762..9d9b8c385f6 100644
--- a/gcc/stor-layout.h
+++ b/gcc/stor-layout.h
@@ -34,6 +34,7 @@ extern tree rli_size_so_far (record_layout_info);
 extern void normalize_rli (record_layout_info);
 extern void place_field (record_layout_info, tree);
 extern void compute_record_mode (tree);
+extern void compute_array_mode (tree);
 extern void finish_bitfield_layout (tree);
 extern void finish_record_layout (record_layout_info, int);
 extern void finalize_size_functions (void);
diff --git a/gcc/tree-streamer-in.cc b/gcc/tree-streamer-in.cc
index 40029437199..329d218e7d4 100644
--- a/gcc/tree-streamer-in.cc
+++ b/gcc/tree-streamer-in.cc
@@ -35,6 +35,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "attribs.h"
 #include "asan.h"
 #include "opts.h"
+#include "stor-layout.h"
 
 
 /* Read a STRING_CST from the string table in DATA_IN using input
@@ -395,6 +396,17 @@ unpack_ts_type_common_value_fields (struct bitpack_d *bp, 
tree expr)
 #ifdef ACCEL_COMPILER
   if (TYPE_ALIGN (expr) > targetm.absolute_biggest_alignment)
     SET_TYPE_ALIGN (expr, targetm.absolute_biggest_alignment);
+
+  /* Host streams out VOIDmode for aggregate type. */
+  if (AGGREGATE_TYPE_P (expr) && TYPE_MODE (expr) == VOIDmode)
+    {
+      if (TREE_CODE (expr) == ARRAY_TYPE)
+       compute_array_mode (expr);
+      else if (RECORD_OR_UNION_TYPE_P (expr))
+       compute_record_mode (expr);
+      else
+       gcc_unreachable ();
+    }
 #endif
 }
 
diff --git a/gcc/tree-streamer-out.cc b/gcc/tree-streamer-out.cc
index b7205287ffb..7de4447a1b5 100644
--- a/gcc/tree-streamer-out.cc
+++ b/gcc/tree-streamer-out.cc
@@ -187,7 +187,17 @@ pack_ts_fixed_cst_value_fields (struct bitpack_d *bp, tree 
expr)
 static void
 pack_ts_decl_common_value_fields (struct bitpack_d *bp, tree expr)
 {
-  bp_pack_machine_mode (bp, DECL_MODE (expr));
+  /* Similar to TYPE_MODE, avoid streaming out host-specific DECL_MODE
+     for aggregate type with offloading enabled, and while streaming-in
+     recompute appropriate DECL_MODE for accelerator.  */
+  if (lto_stream_offload_p
+      && (VAR_P (expr)
+         || TREE_CODE (expr) == PARM_DECL
+         || TREE_CODE (expr) == FIELD_DECL)
+      && AGGREGATE_TYPE_P (TREE_TYPE (expr)))
+    bp_pack_machine_mode (bp, VOIDmode);
+  else
+    bp_pack_machine_mode (bp, DECL_MODE (expr));
   bp_pack_value (bp, DECL_NONLOCAL (expr), 1);
   bp_pack_value (bp, DECL_VIRTUAL_P (expr), 1);
   bp_pack_value (bp, DECL_IGNORED_P (expr), 1);
@@ -317,10 +327,18 @@ pack_ts_function_decl_value_fields (struct bitpack_d *bp, 
tree expr)
 static void
 pack_ts_type_common_value_fields (struct bitpack_d *bp, tree expr)
 {
+  /* For offloading, avoid streaming out TYPE_MODE for aggregate type since
+     it may be host-specific. For eg, aarch64 uses OImode for ARRAY_TYPE
+     whose size is 256-bits, which is not representable on accelerator.
+     Instead stream out VOIDmode, and while streaming-in, recompute
+     appropriate TYPE_MODE for accelerator.  */
+  if (lto_stream_offload_p && AGGREGATE_TYPE_P (expr))
+    bp_pack_machine_mode (bp, VOIDmode);
   /* for VECTOR_TYPE, TYPE_MODE reevaluates the mode using target_flags
      not necessary valid in a global context.
      Use the raw value previously set by layout_type.  */
-  bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr));
+  else
+    bp_pack_machine_mode (bp, TYPE_MODE_RAW (expr));
   /* TYPE_NO_FORCE_BLK is private to stor-layout and need
      no streaming.  */
   bp_pack_value (bp, TYPE_PACKED (expr), 1);

Reply via email to