On Fri, Sep 19, 2025, 6:28 PM Peter Damianov <peter0...@disroot.org> wrote:
> This patch implements folding of aggregate assignments (*dest = *src)
> by converting them to scalar MEM_REF operations when the size
> permits.  This enables vectorization opportunities.
>
> gcc/ChangeLog:
>
>         PR tree-optimization/99504
>         * tree-ssa-forwprop.cc (fold_aggregate_assignment): New function.
>         Folds aggregate assignments to scalar MEM_REF operations.
>         (pass_forwprop::execute): Call fold_aggregate_assignment for
>         applicable assignment statements.
>
> gcc/testsuite/ChangeLog:
>
>         PR tree-optimization/99504
>         * gcc.dg/tree-ssa/forwprop-42.c: New test.  Verify that aggregate
>         assignments of various sizes get folded to scalar MEM_REF
>         operations.
>
> Signed-off-by: Peter Damianov <peter0...@disroot.org>
> ---
> v2: Remove int128 part of test because it cannot apply to every target
>     and would throw off the count of MEM\\\[
>
>  gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c |  53 ++++++++
>  gcc/tree-ssa-forwprop.cc                    | 140 ++++++++++++++++++++
>  2 files changed, 193 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> new file mode 100644
> index 00000000000..aa49cf6a238
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/forwprop-42.c
> @@ -0,0 +1,53 @@
> +/* PR tree-optimization/99504 */
> +/* Test that aggregate assignments get folded to scalar MEM_REF operations */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-forwprop1" } */
> +
> +#include <stdint.h>
> +
> +struct pixel_4 {
> +  uint8_t r, g, b, a;
> +};
> +
> +struct pixel_8 {
> +  uint16_t r, g, b, a;
> +};
> +
> +struct pixel_16 {
> +  uint32_t r, g, b, a;
> +};
> +
> +struct pixel_32 {
> +  uint64_t r, g, b, a;
> +};
> +
> +void test_4_bytes(struct pixel_4 *dest, struct pixel_4 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_8_bytes(struct pixel_8 *dest, struct pixel_8 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_16_bytes(struct pixel_16 *dest, struct pixel_16 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void test_32_bytes(struct pixel_32 *dest, struct pixel_32 *src)
> +{
> +  *dest = *src;
> +}
> +
> +void copy_pixels(struct pixel_4 *dest, struct pixel_4 *src, int n)
> +{
> +  for (int i = 0; i < n; i++)
> +    dest[i] = src[i];
> +}
> +
> +/* { dg-final { scan-tree-dump-times "MEM\\\[" 8 "forwprop1" } } */
> +/* Check that we generate scalar temporaries for the folded assignments */
> +/* { dg-final { scan-tree-dump "_\[0-9\]+ = MEM\\\[" "forwprop1" } } */
> +/* { dg-final { scan-tree-dump "MEM\\\[.*\] = _\[0-9\]+" "forwprop1" } } */
> \ No newline at end of file
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 43b1c9d696f..3ce94a737c6 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -205,6 +205,7 @@ struct _vec_perm_simplify_seq
>  typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
>
>  static bool forward_propagate_addr_expr (tree, tree, bool);
> +static bool fold_aggregate_assignment (gimple_stmt_iterator *);
>
>  /* Set to true if we delete dead edges during the optimization.  */
>  static bool cfg_changed;
>
> @@ -981,6 +982,141 @@ forward_propagate_addr_expr (tree name, tree rhs, bool parent_single_use_p)
>  }
>
>
> +/* Try to optimize aggregate assignments by converting them to scalar
> +   MEM_REF operations when profitable for vectorization.
> +   This applies the same folding as memcpy to aggregate assignments.  */
> +
> +static bool
> +fold_aggregate_assignment (gimple_stmt_iterator *gsi)
> +{
> +  gimple *stmt = gsi_stmt (*gsi);
> +
> +  if (!is_gimple_assign (stmt) || !gimple_assign_single_p (stmt))
> +    return false;
> +
> +  tree lhs = gimple_assign_lhs (stmt);
> +  tree rhs = gimple_assign_rhs1 (stmt);
> +
> +  /* Check if this is an aggregate assignment: *dest = *src
> +     where both sides are aggregate types (can be MEM_REF or indirection).  */
> +  bool lhs_is_indirect = (TREE_CODE (lhs) == INDIRECT_REF);
> +  bool rhs_is_indirect = (TREE_CODE (rhs) == INDIRECT_REF);
> +
> +  if ((TREE_CODE (lhs) != MEM_REF && !lhs_is_indirect)
> +      || (TREE_CODE (rhs) != MEM_REF && !rhs_is_indirect))
> +    return false;
> +
> +  tree lhs_type = TREE_TYPE (lhs);
> +  tree rhs_type = TREE_TYPE (rhs);
> +
> +  if (!AGGREGATE_TYPE_P (lhs_type) || !AGGREGATE_TYPE_P (rhs_type))
> +    return false;
> +
> +  if (!types_compatible_p (lhs_type, rhs_type))
> +    return false;
> +
> +  if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (lhs_type)))
> +    return false;
> +
> +  unsigned HOST_WIDE_INT ilen = tree_to_uhwi (TYPE_SIZE_UNIT (lhs_type));
> +  if (!pow2p_hwi (ilen) || ilen > MOVE_MAX)
> +    return false;
> +
> +  tree lhs_base = TREE_OPERAND (lhs, 0);
> +  tree rhs_base = TREE_OPERAND (rhs, 0);
> +
> +  unsigned int lhs_align = get_pointer_alignment (lhs_base);
> +  unsigned int rhs_align = get_pointer_alignment (rhs_base);
> +
> +  scalar_int_mode imode;
> +  machine_mode mode;
> +  if (!int_mode_for_size (ilen * BITS_PER_UNIT, 0).exists (&imode)
> +      || !bitwise_mode_for_size (ilen * BITS_PER_UNIT).exists (&mode)
> +      || !known_eq (GET_MODE_BITSIZE (mode), ilen * BITS_PER_UNIT))
> +    return false;
> +
> +  if ((lhs_align < GET_MODE_ALIGNMENT (mode)
> +       && targetm.slow_unaligned_access (mode, lhs_align)
> +       && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
> +      || (rhs_align < GET_MODE_ALIGNMENT (mode)
> +          && targetm.slow_unaligned_access (mode, rhs_align)
> +          && optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing))
> +    return false;
> +
> +  tree type = bitwise_type_for_mode (mode);
> +  tree srctype = type;
> +  tree desttype = type;
> +
> +  if (rhs_align < GET_MODE_ALIGNMENT (mode))
> +    srctype = build_aligned_type (type, rhs_align);
> +  if (lhs_align < GET_MODE_ALIGNMENT (mode))
> +    desttype = build_aligned_type (type, lhs_align);
> +
> +  tree off0 = build_int_cst (build_pointer_type_for_mode (char_type_node,
> +                                                          ptr_mode, true), 0);

So you lose the aliasing type here?
Wait, that is only used for INDIRECT_REF, which does not exist in gimple.

So this does not handle:
A->field1 = b->field1;
where field1 is an aggregate which is the same size as the outer aggregate.

There are a few other things too, like non-power-of-2 sizes. Plus FRE/VN have
a harder time reading through a copy done via an integer, and copy propagation
for aggregates does not handle copies via scalars yet. It is one thing to
remove a call, e.g. memcpy, but it is another to remove aggregate copies.
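For instance, something along these lines (struct names made up purely for
illustration) would not be matched, because both sides of the assignment are
COMPONENT_REFs rather than a top-level MEM_REF/INDIRECT_REF, even though the
member copy is the full size of the enclosing struct:

  struct inner { unsigned int r, g, b, a; };
  struct outer { struct inner field1; };

  void
  copy_field (struct outer *a, struct outer *b)
  {
    /* In gimple this is a COMPONENT_REF of a MEM_REF on each side, so the
       TREE_CODE (lhs) == MEM_REF / INDIRECT_REF check above rejects it.  */
    a->field1 = b->field1;
  }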
Thanks,
Andrew

> +
> +  tree srcmem, destmem;
> +
> +  if (rhs_is_indirect)
> +    {
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, off0);
> +    }
> +  else
> +    {
> +      tree rhs_offset = TREE_OPERAND (rhs, 1);
> +      srcmem = fold_build2 (MEM_REF, srctype, rhs_base, rhs_offset);
> +    }
> +
> +  if (lhs_is_indirect)
> +    {
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, off0);
> +    }
> +  else
> +    {
> +      tree lhs_offset = TREE_OPERAND (lhs, 1);
> +      destmem = fold_build2 (MEM_REF, desttype, lhs_base, lhs_offset);
> +    }
> +  gimple *new_stmt;
> +  if (is_gimple_reg_type (srctype))
> +    {
> +      new_stmt = gimple_build_assign (NULL_TREE, srcmem);
> +      tree tmp_var = make_ssa_name (srctype, new_stmt);
> +      gimple_assign_set_lhs (new_stmt, tmp_var);
> +      gimple_set_vuse (new_stmt, gimple_vuse (stmt));
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +
> +      new_stmt = gimple_build_assign (destmem, tmp_var);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +  else
> +    {
> +      new_stmt = gimple_build_assign (destmem, srcmem);
> +      gimple_move_vops (new_stmt, stmt);
> +      gimple_set_location (new_stmt, gimple_location (stmt));
> +      gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
> +      gsi_remove (gsi, true);
> +    }
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file,
> +               "Converted aggregate assignment to scalar MEM_REF:\n");
> +      fprintf (dump_file, "  Original: ");
> +      print_gimple_stmt (dump_file, stmt, 0, dump_flags);
> +      fprintf (dump_file, "  Size: %u bytes, Mode: %s\n",
> +               (unsigned) ilen, GET_MODE_NAME (mode));
> +    }
> +
> +  statistics_counter_event (cfun, "aggregate assignment to scalar MEM_REF", 1);
> +
> +  return true;
> +}
> +
> +
>  /* Helper function for simplify_gimple_switch.  Remove case labels that
>     have values outside the range of the new type.  */
>
> @@ -4477,6 +4613,10 @@ pass_forwprop::execute (function *fun)
>           if (TREE_CODE (lhs) != SSA_NAME
>               || has_zero_uses (lhs))
>             {
> +             if (TREE_CODE (lhs) != SSA_NAME
> +                 && fold_aggregate_assignment (&gsi))
> +               continue;
> +
>               process_vec_perm_simplify_seq_list
>                 (&vec_perm_simplify_seq_list);
>               gsi_next (&gsi);
>               continue;
> --
> 2.39.5
>
>