On Fri, Sep 19, 2025, 6:28 PM Peter Damianov <peter0...@disroot.org> wrote:

> This patch implements folding of aggregate assignments (*dest = *src)
> by converting them to scalar MEM_REF operations when the size
> permits. This enables vectorization opportunities.
>
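> As an illustration (hand-written GIMPLE, not actual dump output), a
> 4-byte struct copy such as:
>
>   #include <stdint.h>
>
>   struct pixel_4 { uint8_t r, g, b, a; };
>
>   void copy (struct pixel_4 *dest, struct pixel_4 *src)
>   {
>     *dest = *src;  /* aggregate assignment */
>   }
>
> is folded from a single aggregate assignment into a scalar load/store
> pair, roughly:
>
>   _1 = MEM[(char *)src_2(D)];
>   MEM[(char *)dest_3(D)] = _1;
>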
> gcc/ChangeLog:
>
>         PR tree-optimization/99504
>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>         Folds aggregate assignments to scalar MEM_REF operations.
>         (pass_forwprop::execute): Call fold_aggregate_assignment for
>         applicable assignment statements.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/99504
>         * gcc.dg/tree-ssa/forwprop-42.c: New test. Verify that aggregate
>         assignments of various sizes get folded to scalar MEM_REF
>         operations.
>
> Signed-off-by: Peter Damianov <peter0...@disroot.org>
> ---
> v2: Removed the int128 part of the test because it is not supported on
> every target and would throw off the count of MEM\\\[
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  53 ++++++++
>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
>  2 files changed, 193 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> new file mode 100644
> index 00000000000..aa49cf6a238
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> @@ -0,0 +1,53 @@
> +/* PR tree-optimization/99504 */
> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> +
> +#include <stdint.h>
> +
> +struct pixel_4 {
> +  uint8_t r, g, b, a;
> +};
> +
> +struct pixel_8 {
> +  uint16_t r, g, b, a;
> +};
> +
> +struct pixel_16 {
> +  uint32_t r, g, b, a;
> +};
> +
> +struct pixel_32 {
> +  uint64_t r, g, b, a;
> +};
> +
> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dest[i] = src[i];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 8 "forwprop1" } } */
> +/* Check that we generate scalar temporaries for the folded assignments */
> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
> \ No newline at end of file
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 43b1c9d696f..3ce94a737c6 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>
>  static bool forward_propagate_addr_expr (tree, tree, bool);
> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>
>  /* Set to true if we delete dead edges during the optimization.  */
>  static bool cfg_changed;
> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
>  }
>
>
> +/* Try to optimize aggregate assignments by converting them to scalar
> +   MEM_REF operations when profitable for vectorization.
> +   This applies the same folding as memcpy to aggregate assignments.  */
> +
> +static bool
> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
> +{
> +  gimple *stmt = gsi_stmt (*gsi);
> +
> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +
> +  /* Check if this is an aggregate assignment: *dest = *src
> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
> +
> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
> +    return false;
> +
> +  tree lhs_type = TREE_TYPE (lhs);
> +  tree rhs_type = TREE_TYPE (rhs);
> +
> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
> +    return false;
> +
> +  if (!types_compatible_p (lhs_type, rhs_type))
> +    return false;
> +
> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
> +    return false;
> +
> +  tree lhs_base = TREE_OPERAND (lhs, 0);
> +  tree rhs_base = TREE_OPERAND (rhs, 0);
> +
> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
> +
> +  scalar_int_mode imode;
> +  machine_mode mode;
> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
> +    return false;
> +
> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
> +       && targetm.slow_unaligned_access (mode, lhs_align)
> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
> +         && targetm.slow_unaligned_access (mode, rhs_align)
> +         && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
> +    return false;
> +
> +  tree type = bitwise_type_for_mode (mode);
> +  tree srctype = type;
> +  tree desttype = type;
> +
> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
> +    srctype = build_aligned_type (type, rhs_align);
> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
> +    desttype = build_aligned_type (type, lhs_align);
> +
> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
> +                                                         ptr_mode, true), 0);
>

So you lose the aliasing type here?
Wait, that is only used for INDIRECT_REF, which does not exist in GIMPLE.

So this does not handle:
A->field1 = b->field1;
where field1 is an aggregate that is the same size as the outer aggregate.
There are a few other issues, like non-power-of-2 sizes.
Plus FRE/VN have a harder time reading through a copy done via an integer.
Also, copy prop for aggregates does not handle copies via scalars yet.
It is one thing to remove a call, e.g. memcpy, but it is another to remove
aggregate copies.
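For instance (an illustrative case, not from the patch's testsuite),
something like:

  struct inner { int x[4]; };
  struct outer { struct inner f; };

  void copy_field (struct outer *a, struct outer *b)
  {
    a->f = b->f;  /* same size and shape as *a = *b */
  }

has COMPONENT_REFs rather than MEM_REFs on both sides, so the new code
does not match it.  And once the copy goes through an integer temporary,
VN can no longer see the individual fields flowing through it.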

Thanks,
Andrew

> +
> +  tree srcmem, destmem;
> +
> +  if (rhs_is_indirect)
> +    {
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
> +    }
> +  else
> +    {
> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
> +    }
> +
> +  if (lhs_is_indirect)
> +    {
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
> +    }
> +  else
> +    {
> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
> +    }
> +  gimple *new_stmt;
> +  if (is_gimple_reg_type (srctype))
> +    {
> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
> +      gimple_assign_set_lhs (new_stmt, tmp_var);
> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +
> +      new_stmt = gimple_build_assign (destmem, tmp_var);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +  else
> +    {
> +      new_stmt = gimple_build_assign (destmem, srcmem);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file,
> +              "Converted aggregate assignment to scalar MEM_REF:\n");
> +      fprintf (dump_file, "  Original: ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
> +              (unsigned)ilen, GET_MODE_NAME (mode));
> +    }
> +
> +  statistics_counter_event (cfun,
> +                            "aggregate assignment to scalar MEM_REF", 1);
> +
> +  return true;
> +}
> +
> +
>  /* Helper function for simplify_gimple_switch.  Remove case labels that
>     have values outside the range of the new type.  */
>
> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>           if (TREE_CODE (lhs) != SSA_NAME
>               || has_zero_uses (lhs))
>             {
> +             if (TREE_CODE (lhs) != SSA_NAME
> +                 && fold_aggregate_assignment (&gsi))
> +               continue;
> +
>               process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
>               gsi_next (&gsi);
>               continue;
> --
> 2.39.5
>
>
