https://gcc.gnu.org/g:d1c5edc94b6d07ec29a93572f3b5086e88bf3b0e

commit r15-8049-gd1c5edc94b6d07ec29a93572f3b5086e88bf3b0e
Author: Richard Sandiford <richard.sandif...@arm.com>
Date:   Fri Mar 14 10:28:01 2025 +0000

    vect: Fix aarch64/pr99873_2.c ld4/st4 failure
    
    vect_slp_prefer_store_lanes_p allows an SLP tree to be split even
    if the tree could use store-lanes, provided that one of the new
    groups would operate on full vectors for each scalar iteration.
    That heuristic is no longer firing for gcc.target/aarch64/pr99873_2.c.
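
    As a rough model, the condition boils down to a divisibility check,
    sketched below in plain C (illustrative only, not the GCC code; the
    function name is made up, and "nunits" stands for the number of
    lanes per vector, e.g. 2 for V2DI):

    /* Sketch: splitting a group of GROUP_SIZE scalars per iteration at
       NEW_GROUP_SIZE gives full vectors within one scalar iteration
       if NUNITS divides the size of either resulting group.  */
    static int
    split_gives_full_vectors (unsigned group_size, unsigned new_group_size,
                              unsigned nunits)
    {
      return (new_group_size % nunits == 0
              || (group_size - new_group_size) % nunits == 0);
    }

    For the test below, the group size is 4 and V2DI has 2 lanes, so a
    split at i == 2 qualifies (2 % 2 == 0), whereas a new group size of
    1 does not (neither 1 nor 3 is a multiple of 2); this is why the
    typo described later mattered.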
    
    The test contains:
    
    void __attribute ((noipa))
    foo (uint64_t *__restrict x, uint64_t *__restrict y, int n)
    {
      for (int i = 0; i < n; i += 4)
        {
          x[i] += y[i];
          x[i + 1] += y[i + 1];
          x[i + 2] |= y[i + 2];
          x[i + 3] |= y[i + 3];
        }
    }
    
    and wants us to use one V2DI operation for the first two elements
    and another for the second two, rather than LD4s and ST4s.  This
    gives:
    
    .L3:
            ldp     q31, q0, [x0]
            add     w3, w3, 1
            ldp     q29, q30, [x1], 32
            orr     v30.16b, v0.16b, v30.16b
            add     v31.2d, v29.2d, v31.2d
            stp     q31, q30, [x0], 32
            cmp     w2, w3
            bhi     .L3
    
    instead of:
    
    .L4:
            ld4     {v28.2d - v31.2d}, [x2]
            ld4     {v24.2d - v27.2d}, [x3], 64
            add     v24.2d, v28.2d, v24.2d
            add     v25.2d, v29.2d, v25.2d
            orr     v26.16b, v30.16b, v26.16b
            orr     v27.16b, v31.16b, v27.16b
            st4     {v24.2d - v27.2d}, [x2], 64
            cmp     x2, x5
            bne     .L4
    
    The first loop handles only half as much data per iteration,
    but it requires far fewer internal permutations.
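
    To see where those permutations come from, here is a rough C model
    of the lane layouts the two sequences imply (a sketch based on the
    LDP/LD4 semantics, with made-up names; not generated code):

    #include <stdint.h>

    typedef struct { uint64_t lane[2]; } v2di;

    /* LDP path: two contiguous V2DI loads covering one scalar
       iteration of the loop above; no shuffling needed.  */
    static void
    ldp_layout (const uint64_t *x, v2di *lo, v2di *hi)
    {
      lo->lane[0] = x[0];  lo->lane[1] = x[1];  /* ADD lanes */
      hi->lane[0] = x[2];  hi->lane[1] = x[3];  /* ORR lanes */
    }

    /* LD4 path: eight elements (two scalar iterations) are
       de-interleaved so that vector POS collects group member POS
       from each iteration.  */
    static void
    ld4_layout (const uint64_t *x, v2di v[4])
    {
      for (int pos = 0; pos < 4; pos++)
        {
          v[pos].lane[0] = x[pos];      /* iteration i      */
          v[pos].lane[1] = x[pos + 4];  /* iteration i + 4  */
        }
    }

    The interleaving is done inside the LD4/ST4 instructions
    themselves; those are the internal permutations that the ldp/stp
    version avoids.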
    
    One reason the heuristic no longer fired appears to be a typo: the
    call to vect_slp_prefer_store_lanes_p was passing "1" rather than
    "i" as the new group size.
    
    However, even with that fixed, vect_analyze_slp later falls back on
    single-lane SLP with load/store-lanes.  I think that heuristic
    should also use vect_slp_prefer_store_lanes_p (it otherwise looks
    good).
    
    The question is whether every load should pass
    vect_slp_prefer_store_lanes_p or whether just one is enough.
    I don't have an example that would make the call either way,
    so I went for the latter, given that it's the smaller change
    from the status quo.
    
    This also appears to improve fotonik3d and roms from SPEC2017
    (cross-checked against two different systems).
    
    gcc/
            * tree-vect-slp.cc (vect_build_slp_instance): Pass the new group
            size (i) rather than 1 to vect_slp_prefer_store_lanes_p.
            (vect_analyze_slp): Only force the use of load-lanes and
            store-lanes if that is preferred for at least one load/store pair.

Diff:
---
 gcc/tree-vect-slp.cc | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9e09f8e980b5..ecb4a6521dec 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -4161,7 +4161,7 @@ vect_build_slp_instance (vec_info *vinfo,
               && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
               && compare_step_with_zero (vinfo, stmt_info) > 0
               && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
-                                                masked_p, group_size, 1));
+                                                masked_p, group_size, i));
          if (want_store_lanes || force_single_lane)
            i = 1;
 
@@ -5095,7 +5095,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
          && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
        {
          slp_tree slp_root = SLP_INSTANCE_TREE (instance);
-         int group_size = SLP_TREE_LANES (slp_root);
+         unsigned int group_size = SLP_TREE_LANES (slp_root);
          tree vectype = SLP_TREE_VECTYPE (slp_root);
 
          stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
@@ -5138,6 +5138,7 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
          if (loads_permuted)
            {
              bool can_use_lanes = true;
+             bool prefer_load_lanes = false;
              FOR_EACH_VEC_ELT (loads, j, load_node)
                if (STMT_VINFO_GROUPED_ACCESS
                      (SLP_TREE_REPRESENTATIVE (load_node)))
@@ -5165,9 +5166,21 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
                        can_use_lanes = false;
                        break;
                      }
+                   /* Make sure that the target would prefer store-lanes
+                      for at least one of the loads.
+
+                      ??? Perhaps we should instead require this for
+                      all loads?  */
+                   prefer_load_lanes
+                     = (prefer_load_lanes
+                        || SLP_TREE_LANES (load_node) == group_size
+                        || (vect_slp_prefer_store_lanes_p
+                            (vinfo, stmt_vinfo,
+                             STMT_VINFO_VECTYPE (stmt_vinfo), masked,
+                             group_size, SLP_TREE_LANES (load_node))));
                  }
 
-             if (can_use_lanes)
+             if (can_use_lanes && prefer_load_lanes)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
