Hi All,

With the move to SLP-only we now pass the VMAT through the SLP node; however,
the majority of the costing calls inside vectorizable_load and
vectorizable_store do not pass the SLP node along.  Due to this the backend
costing never sees the VMAT for these cases anymore.

Additionally, the helper around record_stmt_cost would, when both an SLP node
and a stmt_vinfo are passed, only pass the SLP node along.  However the SLP
node doesn't contain all the info available in the stmt_vinfo and we'd have to
go through the SLP_TREE_REPRESENTATIVE anyway.  As such I changed the function
to just always pass both along.  Unlike the VMAT changes, I don't believe there
is a correctness issue here, but this minimizes the amount of churn in the
backend costing until vectorizer costing as a whole is revisited in GCC 16.

These changes re-enable the cost model on AArch64 and also correctly find the
VMATs on loads and stores fixing testcases such as sve_iters_low_2.c.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu (-m32 and -m64); no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-data-refs.cc (vect_get_data_access_cost): Pass NULL for SLP
        node.
        * tree-vect-stmts.cc (record_stmt_cost): Expose.
        (vect_get_store_cost, vect_get_load_cost): Extend with SLP node.
        (vectorizable_store, vectorizable_load): Pass SLP node to all costing.
        * tree-vectorizer.h (record_stmt_cost): Always pass both SLP node and
        stmt_vinfo to costing.
        (vect_get_load_cost, vect_get_store_cost): Extend with SLP node.

---
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 
3ea5fb883b1a5289195142171eb45fa422910a95..d87ca79b8e4c16d242e67431d1b527bdb8cb74e4
 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1729,12 +1729,14 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info 
*dr_info,
     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
 
   if (DR_IS_READ (dr_info->dr))
-    vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
-                       misalignment, true, inside_cost,
-                       outside_cost, prologue_cost_vec, body_cost_vec, false);
+    vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
+                       alignment_support_scheme, misalignment, true,
+                       inside_cost, outside_cost, prologue_cost_vec,
+                       body_cost_vec, false);
   else
-    vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
-                        misalignment, inside_cost, body_cost_vec);
+    vect_get_store_cost (vinfo,stmt_info, NULL, ncopies,
+                        alignment_support_scheme, misalignment, inside_cost,
+                        body_cost_vec);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 
7a92da00f7ddcfdf146fa1c2511f609e8bc40e9e..46543c15c00f00e5127d06446f58fce79951c3b0
 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -93,7 +93,7 @@ stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info 
*stmt_info)
    target model or by saving it in a vector for later processing.
    Return a preliminary estimate of the statement's cost.  */
 
-static unsigned
+unsigned
 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
                  enum vect_cost_for_stmt kind,
                  stmt_vec_info stmt_info, slp_tree node,
@@ -1008,8 +1008,8 @@ cfun_returns (tree decl)
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-                    dr_alignment_support alignment_support_scheme,
+vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+                    int ncopies, dr_alignment_support alignment_support_scheme,
                     int misalignment,
                     unsigned int *inside_cost,
                     stmt_vector_for_cost *body_cost_vec)
@@ -1019,7 +1019,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, 
int ncopies,
     case dr_aligned:
       {
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                         vector_store, stmt_info, 0,
+                                         vector_store, stmt_info, slp_node, 0,
                                          vect_body);
 
         if (dump_enabled_p ())
@@ -1032,7 +1032,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, 
int ncopies,
       {
         /* Here, we assign an additional cost for the unaligned store.  */
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                         unaligned_store, stmt_info,
+                                         unaligned_store, stmt_info, slp_node,
                                          misalignment, vect_body);
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1058,8 +1058,8 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, 
int ncopies,
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-                   dr_alignment_support alignment_support_scheme,
+vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+                   int ncopies, dr_alignment_support alignment_support_scheme,
                    int misalignment,
                    bool add_realign_cost, unsigned int *inside_cost,
                    unsigned int *prologue_cost,
@@ -1072,7 +1072,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, 
int ncopies,
     case dr_aligned:
       {
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-                                         stmt_info, 0, vect_body);
+                                         stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1084,7 +1084,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, 
int ncopies,
       {
         /* Here, we assign an additional cost for the unaligned load.  */
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                         unaligned_load, stmt_info,
+                                         unaligned_load, stmt_info, slp_node,
                                          misalignment, vect_body);
 
         if (dump_enabled_p ())
@@ -1097,16 +1097,18 @@ vect_get_load_cost (vec_info *, stmt_vec_info 
stmt_info, int ncopies,
     case dr_explicit_realign:
       {
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
-                                         vector_load, stmt_info, 0, vect_body);
+                                         vector_load, stmt_info, slp_node, 0,
+                                         vect_body);
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-                                         vec_perm, stmt_info, 0, vect_body);
+                                         vec_perm, stmt_info, slp_node, 0,
+                                         vect_body);
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
            prologue costs.  */
         if (targetm.vectorize.builtin_mask_for_load)
          *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
-                                           stmt_info, 0, vect_body);
+                                           stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1132,17 +1134,17 @@ vect_get_load_cost (vec_info *, stmt_vec_info 
stmt_info, int ncopies,
           {
            *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
                                                vector_stmt, stmt_info,
-                                               0, vect_prologue);
+                                               slp_node, 0, vect_prologue);
             if (targetm.vectorize.builtin_mask_for_load)
              *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
                                                  vector_stmt, stmt_info,
-                                                 0, vect_prologue);
+                                                 slp_node, 0, vect_prologue);
           }
 
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-                                         stmt_info, 0, vect_body);
+                                         stmt_info, slp_node, 0, vect_body);
        *inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
-                                         stmt_info, 0, vect_body);
+                                         stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -8551,7 +8553,7 @@ vectorizable_store (vec_info *vinfo,
          if (vls_type == VLS_STORE_INVARIANT)
            prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
                                               stmt_info, 0, vect_prologue);
-         vect_get_store_cost (vinfo, stmt_info, ncopies,
+         vect_get_store_cost (vinfo, stmt_info, slp_node, ncopies,
                               alignment_support_scheme, misalignment,
                               &inside_cost, cost_vec);
 
@@ -8623,7 +8625,7 @@ vectorizable_store (vec_info *vinfo,
     else if (vls_type != VLS_STORE_INVARIANT)
       return;
     *prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
-                                       0, vect_prologue);
+                                       slp_node, 0, vect_prologue);
   };
 
   if (memory_access_type == VMAT_ELEMENTWISE
@@ -8890,7 +8892,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_stores > 0)
-           vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+           vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                                 alignment_support_scheme, misalignment,
                                 &inside_cost, cost_vec);
          if (dump_enabled_p ())
@@ -9202,7 +9204,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_stores > 0)
-           vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+           vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                                 alignment_support_scheme, misalignment,
                                 &inside_cost, cost_vec);
          if (dump_enabled_p ())
@@ -9227,7 +9229,8 @@ vectorizable_store (vec_info *vinfo,
            {
              if (costing_p && vls_type == VLS_STORE_INVARIANT)
                prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-                                                  stmt_info, 0, vect_prologue);
+                                                  stmt_info, slp_node, 0,
+                                                  vect_prologue);
              else if (!costing_p)
                {
                  /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
@@ -9304,7 +9307,8 @@ vectorizable_store (vec_info *vinfo,
                      unsigned int cnunits = vect_nunits_for_cost (vectype);
                      inside_cost
                          += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                              stmt_info, 0, vect_body);
+                                              stmt_info, slp_node, 0,
+                                              vect_body);
                      continue;
                    }
 
@@ -9371,7 +9375,7 @@ vectorizable_store (vec_info *vinfo,
                      unsigned int cnunits = vect_nunits_for_cost (vectype);
                      inside_cost
                        += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0, vect_body);
                      continue;
                    }
                  poly_uint64 offset_nunits
@@ -9478,14 +9482,14 @@ vectorizable_store (vec_info *vinfo,
                         consumed by the load).  */
                      inside_cost
                        += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0, vect_body);
                      /* N scalar stores plus extracting the elements.  */
                      inside_cost
                        += record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0, vect_body);
                      inside_cost
                        += record_stmt_cost (cost_vec, cnunits, scalar_store,
-                                            stmt_info, 0, vect_body);
+                                            stmt_info, slp_node, 0, vect_body);
                      continue;
                    }
 
@@ -9679,7 +9683,8 @@ vectorizable_store (vec_info *vinfo,
              int group_size = DR_GROUP_SIZE (first_stmt_info);
              int nstmts = ceil_log2 (group_size) * group_size;
              inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-                                              stmt_info, 0, vect_body);
+                                              stmt_info, slp_node, 0,
+                                              vect_body);
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "vect_model_store_cost: "
@@ -9708,7 +9713,8 @@ vectorizable_store (vec_info *vinfo,
            {
              if (costing_p)
                inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
-                                                stmt_info, 0, vect_body);
+                                                stmt_info, slp_node, 0,
+                                                vect_body);
              else
                {
                  tree perm_mask = perm_mask_for_reverse (vectype);
@@ -9901,7 +9907,7 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p)
     {
       if (n_adjacent_stores > 0)
-       vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+       vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
                             alignment_support_scheme, misalignment,
                             &inside_cost, cost_vec);
 
@@ -9927,11 +9933,11 @@ vectorizable_store (vec_info *vinfo,
                  /* Spill.  */
                  prologue_cost
                    += record_stmt_cost (cost_vec, ncopies, vector_store,
-                                        stmt_info, 0, vect_epilogue);
+                                        stmt_info, slp_node, 0, vect_epilogue);
                  /* Loads.  */
                  prologue_cost
                    += record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
-                                        stmt_info, 0, vect_epilogue);
+                                        stmt_info, slp_node, 0, vect_epilogue);
                }
            }
        }
@@ -10502,9 +10508,10 @@ vectorizable_load (vec_info *vinfo,
          enum vect_cost_model_location cost_loc
            = hoist_p ? vect_prologue : vect_body;
          unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
-                                               stmt_info, 0, cost_loc);
-         cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info, 0,
-                                   cost_loc);
+                                               stmt_info, slp_node, 0,
+                                               cost_loc);
+         cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
+                                   slp_node, 0, cost_loc);
          unsigned int prologue_cost = hoist_p ? cost : 0;
          unsigned int inside_cost = hoist_p ? 0 : cost;
          if (dump_enabled_p ())
@@ -10725,7 +10732,8 @@ vectorizable_load (vec_info *vinfo,
                    n_adjacent_loads++;
                  else
                    inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
-                                                    stmt_info, 0, vect_body);
+                                                    stmt_info, slp_node, 0,
+                                                    vect_body);
                  continue;
                }
              tree this_off = build_int_cst (TREE_TYPE (alias_off),
@@ -10763,7 +10771,8 @@ vectorizable_load (vec_info *vinfo,
            {
              if (costing_p)
                inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
-                                                stmt_info, 0, vect_body);
+                                                stmt_info, slp_node, 0,
+                                                vect_body);
              else
                {
                  tree vec_inv = build_constructor (lvectype, v);
@@ -10809,7 +10818,8 @@ vectorizable_load (vec_info *vinfo,
              vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
                                            true, &n_perms, &n_loads);
              inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
-                                              first_stmt_info, 0, vect_body);
+                                              first_stmt_info, slp_node, 0,
+                                              vect_body);
            }
          else
            vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
@@ -10819,7 +10829,7 @@ vectorizable_load (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_loads > 0)
-           vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+           vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                                alignment_support_scheme, misalignment, false,
                                &inside_cost, nullptr, cost_vec, cost_vec,
                                true);
@@ -11173,7 +11183,7 @@ vectorizable_load (vec_info *vinfo,
                                         "vect_model_load_cost: %d "
                                         "unused vectors.\n",
                                         gaps);
-                     vect_get_load_cost (vinfo, stmt_info, gaps,
+                     vect_get_load_cost (vinfo, stmt_info, slp_node, gaps,
                                          alignment_support_scheme,
                                          misalignment, false, &inside_cost,
                                          &prologue_cost, cost_vec, cost_vec,
@@ -11302,7 +11312,7 @@ vectorizable_load (vec_info *vinfo,
       if (costing_p)
        {
          if (n_adjacent_loads > 0)
-           vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+           vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                                alignment_support_scheme, misalignment, false,
                                &inside_cost, &prologue_cost, cost_vec,
                                cost_vec, true);
@@ -11379,7 +11389,7 @@ vectorizable_load (vec_info *vinfo,
                      unsigned int cnunits = vect_nunits_for_cost (vectype);
                      inside_cost
                        = record_stmt_cost (cost_vec, cnunits, scalar_load,
-                                           stmt_info, 0, vect_body);
+                                           stmt_info, slp_node, 0, vect_body);
                      continue;
                    }
                  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -11455,7 +11465,7 @@ vectorizable_load (vec_info *vinfo,
                      unsigned int cnunits = vect_nunits_for_cost (vectype);
                      inside_cost
                        = record_stmt_cost (cost_vec, cnunits, scalar_load,
-                                           stmt_info, 0, vect_body);
+                                           stmt_info, slp_node, 0, vect_body);
                      continue;
                    }
                  poly_uint64 offset_nunits
@@ -11590,7 +11600,7 @@ vectorizable_load (vec_info *vinfo,
                         vector.  */
                      inside_cost
                        = record_stmt_cost (cost_vec, const_nunits, scalar_load,
-                                           stmt_info, 0, vect_body);
+                                           stmt_info, slp_node, 0, vect_body);
                      inside_cost
                        = record_stmt_cost (cost_vec, 1, vec_construct,
                                            stmt_info, slp_node, 0, vect_body);
@@ -12177,7 +12187,7 @@ vectorizable_load (vec_info *vinfo,
                  /* Leave realign cases alone to keep them simple.  */
                  if (alignment_support_scheme == dr_explicit_realign_optimized
                      || alignment_support_scheme == dr_explicit_realign)
-                   vect_get_load_cost (vinfo, stmt_info, 1,
+                   vect_get_load_cost (vinfo, stmt_info, slp_node, 1,
                                        alignment_support_scheme, misalignment,
                                        add_realign_cost, &inside_cost,
                                        &prologue_cost, cost_vec, cost_vec,
@@ -12250,7 +12260,8 @@ vectorizable_load (vec_info *vinfo,
            {
              if (costing_p)
                inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-                                               stmt_info, 0, vect_body);
+                                               stmt_info, slp_node, 0,
+                                               vect_body);
              else
                {
                  tree perm_mask = perm_mask_for_reverse (vectype);
@@ -12319,7 +12330,8 @@ vectorizable_load (vec_info *vinfo,
              vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
                                            true, &n_perms, nullptr);
              inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
-                                             stmt_info, 0, vect_body);
+                                             stmt_info, slp_node, 0,
+                                             vect_body);
            }
          else
            {
@@ -12346,7 +12358,8 @@ vectorizable_load (vec_info *vinfo,
                  int group_size = DR_GROUP_SIZE (first_stmt_info);
                  int nstmts = ceil_log2 (group_size) * group_size;
                  inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-                                                  stmt_info, 0, vect_body);
+                                                  stmt_info, slp_node, 0,
+                                                  vect_body);
 
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
@@ -12375,7 +12388,7 @@ vectorizable_load (vec_info *vinfo,
                  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
                  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
       if (n_adjacent_loads > 0)
-       vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+       vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
                            alignment_support_scheme, misalignment, false,
                            &inside_cost, &prologue_cost, cost_vec, cost_vec,
                            true);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 
dcad41dcf182045e868a83276e39ca71a82738d5..7f69a3f57b492ad9ecbd63ecdea27e9abe386ac5
 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2354,6 +2354,10 @@ extern unsigned record_stmt_cost (stmt_vector_for_cost 
*, int,
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
                                  enum vect_cost_for_stmt,
                                  enum vect_cost_model_location);
+extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
+                                 enum vect_cost_for_stmt, stmt_vec_info,
+                                 slp_tree, tree, int,
+                                 enum vect_cost_model_location);
 
 /* Overload of record_stmt_cost with VECTYPE derived from STMT_INFO.  */
 
@@ -2375,12 +2379,8 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, 
int count,
                  slp_tree node,
                  int misalign, enum vect_cost_model_location where)
 {
-  if (node)
-    return record_stmt_cost (body_cost_vec, count, kind, node,
-                            STMT_VINFO_VECTYPE (stmt_info), misalign, where);
-  else
-    return record_stmt_cost (body_cost_vec, count, kind, stmt_info,
-                            STMT_VINFO_VECTYPE (stmt_info), misalign, where);
+  return record_stmt_cost (body_cost_vec, count, kind, stmt_info, node,
+                          STMT_VINFO_VECTYPE (stmt_info), misalign, where);
 }
 
 extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
@@ -2411,12 +2411,12 @@ extern bool vect_nop_conversion_p (stmt_vec_info);
 extern opt_result vect_analyze_stmt (vec_info *, stmt_vec_info, bool *,
                                     slp_tree,
                                     slp_instance, stmt_vector_for_cost *);
-extern void vect_get_load_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_load_cost (vec_info *, stmt_vec_info, slp_tree, int,
                                dr_alignment_support, int, bool,
                                unsigned int *, unsigned int *,
                                stmt_vector_for_cost *,
                                stmt_vector_for_cost *, bool);
-extern void vect_get_store_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_store_cost (vec_info *, stmt_vec_info, slp_tree, int,
                                 dr_alignment_support, int,
                                 unsigned int *, stmt_vector_for_cost *);
 extern bool vect_supportable_shift (vec_info *, enum tree_code, tree);




-- 
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index 3ea5fb883b1a5289195142171eb45fa422910a95..d87ca79b8e4c16d242e67431d1b527bdb8cb74e4 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -1729,12 +1729,14 @@ vect_get_data_access_cost (vec_info *vinfo, dr_vec_info *dr_info,
     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
 
   if (DR_IS_READ (dr_info->dr))
-    vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
-			misalignment, true, inside_cost,
-			outside_cost, prologue_cost_vec, body_cost_vec, false);
+    vect_get_load_cost (vinfo, stmt_info, NULL, ncopies,
+			alignment_support_scheme, misalignment, true,
+			inside_cost, outside_cost, prologue_cost_vec,
+			body_cost_vec, false);
   else
-    vect_get_store_cost (vinfo,stmt_info, ncopies, alignment_support_scheme,
-			 misalignment, inside_cost, body_cost_vec);
+    vect_get_store_cost (vinfo,stmt_info, NULL, ncopies,
+			 alignment_support_scheme, misalignment, inside_cost,
+			 body_cost_vec);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7a92da00f7ddcfdf146fa1c2511f609e8bc40e9e..46543c15c00f00e5127d06446f58fce79951c3b0 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -93,7 +93,7 @@ stmt_in_inner_loop_p (vec_info *vinfo, class _stmt_vec_info *stmt_info)
    target model or by saving it in a vector for later processing.
    Return a preliminary estimate of the statement's cost.  */
 
-static unsigned
+unsigned
 record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
 		  enum vect_cost_for_stmt kind,
 		  stmt_vec_info stmt_info, slp_tree node,
@@ -1008,8 +1008,8 @@ cfun_returns (tree decl)
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-		     dr_alignment_support alignment_support_scheme,
+vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+		     int ncopies, dr_alignment_support alignment_support_scheme,
 		     int misalignment,
 		     unsigned int *inside_cost,
 		     stmt_vector_for_cost *body_cost_vec)
@@ -1019,7 +1019,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_aligned:
       {
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-					  vector_store, stmt_info, 0,
+					  vector_store, stmt_info, slp_node, 0,
 					  vect_body);
 
         if (dump_enabled_p ())
@@ -1032,7 +1032,7 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
       {
         /* Here, we assign an additional cost for the unaligned store.  */
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-					  unaligned_store, stmt_info,
+					  unaligned_store, stmt_info, slp_node,
 					  misalignment, vect_body);
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1058,8 +1058,8 @@ vect_get_store_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
 
 /* Calculate cost of DR's memory access.  */
 void
-vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
-		    dr_alignment_support alignment_support_scheme,
+vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, slp_tree slp_node,
+		    int ncopies, dr_alignment_support alignment_support_scheme,
 		    int misalignment,
 		    bool add_realign_cost, unsigned int *inside_cost,
 		    unsigned int *prologue_cost,
@@ -1072,7 +1072,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_aligned:
       {
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-					  stmt_info, 0, vect_body);
+					  stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1084,7 +1084,7 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
       {
         /* Here, we assign an additional cost for the unaligned load.  */
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-					  unaligned_load, stmt_info,
+					  unaligned_load, stmt_info, slp_node,
 					  misalignment, vect_body);
 
         if (dump_enabled_p ())
@@ -1097,16 +1097,18 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
     case dr_explicit_realign:
       {
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies * 2,
-					  vector_load, stmt_info, 0, vect_body);
+					  vector_load, stmt_info, slp_node, 0,
+					  vect_body);
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies,
-					  vec_perm, stmt_info, 0, vect_body);
+					  vec_perm, stmt_info, slp_node, 0,
+					  vect_body);
 
         /* FIXME: If the misalignment remains fixed across the iterations of
            the containing loop, the following cost should be added to the
            prologue costs.  */
         if (targetm.vectorize.builtin_mask_for_load)
 	  *inside_cost += record_stmt_cost (body_cost_vec, 1, vector_stmt,
-					    stmt_info, 0, vect_body);
+					    stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -1132,17 +1134,17 @@ vect_get_load_cost (vec_info *, stmt_vec_info stmt_info, int ncopies,
           {
 	    *prologue_cost += record_stmt_cost (prologue_cost_vec, 2,
 						vector_stmt, stmt_info,
-						0, vect_prologue);
+						slp_node, 0, vect_prologue);
             if (targetm.vectorize.builtin_mask_for_load)
 	      *prologue_cost += record_stmt_cost (prologue_cost_vec, 1,
 						  vector_stmt, stmt_info,
-						  0, vect_prologue);
+						  slp_node, 0, vect_prologue);
           }
 
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vector_load,
-					  stmt_info, 0, vect_body);
+					  stmt_info, slp_node, 0, vect_body);
 	*inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_perm,
-					  stmt_info, 0, vect_body);
+					  stmt_info, slp_node, 0, vect_body);
 
         if (dump_enabled_p ())
           dump_printf_loc (MSG_NOTE, vect_location,
@@ -8551,7 +8553,7 @@ vectorizable_store (vec_info *vinfo,
 	  if (vls_type == VLS_STORE_INVARIANT)
 	    prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
 					       stmt_info, 0, vect_prologue);
-	  vect_get_store_cost (vinfo, stmt_info, ncopies,
+	  vect_get_store_cost (vinfo, stmt_info, slp_node, ncopies,
 			       alignment_support_scheme, misalignment,
 			       &inside_cost, cost_vec);
 
@@ -8623,7 +8625,7 @@ vectorizable_store (vec_info *vinfo,
     else if (vls_type != VLS_STORE_INVARIANT)
       return;
     *prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
-					0, vect_prologue);
+					slp_node, 0, vect_prologue);
   };
 
   if (memory_access_type == VMAT_ELEMENTWISE
@@ -8890,7 +8892,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
 	{
 	  if (n_adjacent_stores > 0)
-	    vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+	    vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
 				 alignment_support_scheme, misalignment,
 				 &inside_cost, cost_vec);
 	  if (dump_enabled_p ())
@@ -9202,7 +9204,7 @@ vectorizable_store (vec_info *vinfo,
       if (costing_p)
 	{
 	  if (n_adjacent_stores > 0)
-	    vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+	    vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
 				 alignment_support_scheme, misalignment,
 				 &inside_cost, cost_vec);
 	  if (dump_enabled_p ())
@@ -9227,7 +9229,8 @@ vectorizable_store (vec_info *vinfo,
 	    {
 	      if (costing_p && vls_type == VLS_STORE_INVARIANT)
 		prologue_cost += record_stmt_cost (cost_vec, 1, scalar_to_vec,
-						   stmt_info, 0, vect_prologue);
+						   stmt_info, slp_node, 0,
+						   vect_prologue);
 	      else if (!costing_p)
 		{
 		  /* Since the store is not grouped, DR_GROUP_SIZE is 1, and
@@ -9304,7 +9307,8 @@ vectorizable_store (vec_info *vinfo,
 		      unsigned int cnunits = vect_nunits_for_cost (vectype);
 		      inside_cost
 			  += record_stmt_cost (cost_vec, cnunits, scalar_store,
-					       stmt_info, 0, vect_body);
+					       stmt_info, slp_node, 0,
+					       vect_body);
 		      continue;
 		    }
 
@@ -9371,7 +9375,7 @@ vectorizable_store (vec_info *vinfo,
 		      unsigned int cnunits = vect_nunits_for_cost (vectype);
 		      inside_cost
 			+= record_stmt_cost (cost_vec, cnunits, scalar_store,
-					     stmt_info, 0, vect_body);
+					     stmt_info, slp_node, 0, vect_body);
 		      continue;
 		    }
 		  poly_uint64 offset_nunits
@@ -9478,14 +9482,14 @@ vectorizable_store (vec_info *vinfo,
 			 consumed by the load).  */
 		      inside_cost
 			+= record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-					     stmt_info, 0, vect_body);
+					     stmt_info, slp_node, 0, vect_body);
 		      /* N scalar stores plus extracting the elements.  */
 		      inside_cost
 			+= record_stmt_cost (cost_vec, cnunits, vec_to_scalar,
-					     stmt_info, 0, vect_body);
+					     stmt_info, slp_node, 0, vect_body);
 		      inside_cost
 			+= record_stmt_cost (cost_vec, cnunits, scalar_store,
-					     stmt_info, 0, vect_body);
+					     stmt_info, slp_node, 0, vect_body);
 		      continue;
 		    }
 
@@ -9679,7 +9683,8 @@ vectorizable_store (vec_info *vinfo,
 	      int group_size = DR_GROUP_SIZE (first_stmt_info);
 	      int nstmts = ceil_log2 (group_size) * group_size;
 	      inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-					       stmt_info, 0, vect_body);
+					       stmt_info, slp_node, 0,
+					       vect_body);
 	      if (dump_enabled_p ())
 		dump_printf_loc (MSG_NOTE, vect_location,
 				 "vect_model_store_cost: "
@@ -9708,7 +9713,8 @@ vectorizable_store (vec_info *vinfo,
 	    {
 	      if (costing_p)
 		inside_cost += record_stmt_cost (cost_vec, 1, vec_perm,
-						 stmt_info, 0, vect_body);
+						 stmt_info, slp_node, 0,
+						 vect_body);
 	      else
 		{
 		  tree perm_mask = perm_mask_for_reverse (vectype);
@@ -9901,7 +9907,7 @@ vectorizable_store (vec_info *vinfo,
   if (costing_p)
     {
       if (n_adjacent_stores > 0)
-	vect_get_store_cost (vinfo, stmt_info, n_adjacent_stores,
+	vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
 			     alignment_support_scheme, misalignment,
 			     &inside_cost, cost_vec);
 
@@ -9927,11 +9933,11 @@ vectorizable_store (vec_info *vinfo,
 		  /* Spill.  */
 		  prologue_cost
 		    += record_stmt_cost (cost_vec, ncopies, vector_store,
-					 stmt_info, 0, vect_epilogue);
+					 stmt_info, slp_node, 0, vect_epilogue);
 		  /* Loads.  */
 		  prologue_cost
 		    += record_stmt_cost (cost_vec, ncopies * nregs, scalar_load,
-					 stmt_info, 0, vect_epilogue);
+					 stmt_info, slp_node, 0, vect_epilogue);
 		}
 	    }
 	}
@@ -10502,9 +10508,10 @@ vectorizable_load (vec_info *vinfo,
 	  enum vect_cost_model_location cost_loc
 	    = hoist_p ? vect_prologue : vect_body;
 	  unsigned int cost = record_stmt_cost (cost_vec, 1, scalar_load,
-						stmt_info, 0, cost_loc);
-	  cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info, 0,
-				    cost_loc);
+						stmt_info, slp_node, 0,
+						cost_loc);
+	  cost += record_stmt_cost (cost_vec, 1, scalar_to_vec, stmt_info,
+				    slp_node, 0, cost_loc);
 	  unsigned int prologue_cost = hoist_p ? cost : 0;
 	  unsigned int inside_cost = hoist_p ? 0 : cost;
 	  if (dump_enabled_p ())
@@ -10725,7 +10732,8 @@ vectorizable_load (vec_info *vinfo,
 		    n_adjacent_loads++;
 		  else
 		    inside_cost += record_stmt_cost (cost_vec, 1, scalar_load,
-						     stmt_info, 0, vect_body);
+						     stmt_info, slp_node, 0,
+						     vect_body);
 		  continue;
 		}
 	      tree this_off = build_int_cst (TREE_TYPE (alias_off),
@@ -10763,7 +10771,8 @@ vectorizable_load (vec_info *vinfo,
 	    {
 	      if (costing_p)
 		inside_cost += record_stmt_cost (cost_vec, 1, vec_construct,
-						 stmt_info, 0, vect_body);
+						 stmt_info, slp_node, 0,
+						 vect_body);
 	      else
 		{
 		  tree vec_inv = build_constructor (lvectype, v);
@@ -10809,7 +10818,8 @@ vectorizable_load (vec_info *vinfo,
 	      vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
 					    true, &n_perms, &n_loads);
 	      inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
-					       first_stmt_info, 0, vect_body);
+					       first_stmt_info, slp_node, 0,
+					       vect_body);
 	    }
 	  else
 	    vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
@@ -10819,7 +10829,7 @@ vectorizable_load (vec_info *vinfo,
       if (costing_p)
 	{
 	  if (n_adjacent_loads > 0)
-	    vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+	    vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
 				alignment_support_scheme, misalignment, false,
 				&inside_cost, nullptr, cost_vec, cost_vec,
 				true);
@@ -11173,7 +11183,7 @@ vectorizable_load (vec_info *vinfo,
 					 "vect_model_load_cost: %d "
 					 "unused vectors.\n",
 					 gaps);
-		      vect_get_load_cost (vinfo, stmt_info, gaps,
+		      vect_get_load_cost (vinfo, stmt_info, slp_node, gaps,
 					  alignment_support_scheme,
 					  misalignment, false, &inside_cost,
 					  &prologue_cost, cost_vec, cost_vec,
@@ -11302,7 +11312,7 @@ vectorizable_load (vec_info *vinfo,
       if (costing_p)
 	{
 	  if (n_adjacent_loads > 0)
-	    vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+	    vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
 				alignment_support_scheme, misalignment, false,
 				&inside_cost, &prologue_cost, cost_vec,
 				cost_vec, true);
@@ -11379,7 +11389,7 @@ vectorizable_load (vec_info *vinfo,
 		      unsigned int cnunits = vect_nunits_for_cost (vectype);
 		      inside_cost
 			= record_stmt_cost (cost_vec, cnunits, scalar_load,
-					    stmt_info, 0, vect_body);
+					    stmt_info, slp_node, 0, vect_body);
 		      continue;
 		    }
 		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
@@ -11455,7 +11465,7 @@ vectorizable_load (vec_info *vinfo,
 		      unsigned int cnunits = vect_nunits_for_cost (vectype);
 		      inside_cost
 			= record_stmt_cost (cost_vec, cnunits, scalar_load,
-					    stmt_info, 0, vect_body);
+					    stmt_info, slp_node, 0, vect_body);
 		      continue;
 		    }
 		  poly_uint64 offset_nunits
@@ -11590,7 +11600,7 @@ vectorizable_load (vec_info *vinfo,
 			 vector.  */
 		      inside_cost
 			= record_stmt_cost (cost_vec, const_nunits, scalar_load,
-					    stmt_info, 0, vect_body);
+					    stmt_info, slp_node, 0, vect_body);
 		      inside_cost
 			= record_stmt_cost (cost_vec, 1, vec_construct,
 					    stmt_info, slp_node, 0, vect_body);
@@ -12177,7 +12187,7 @@ vectorizable_load (vec_info *vinfo,
 		  /* Leave realign cases alone to keep them simple.  */
 		  if (alignment_support_scheme == dr_explicit_realign_optimized
 		      || alignment_support_scheme == dr_explicit_realign)
-		    vect_get_load_cost (vinfo, stmt_info, 1,
+		    vect_get_load_cost (vinfo, stmt_info, slp_node, 1,
 					alignment_support_scheme, misalignment,
 					add_realign_cost, &inside_cost,
 					&prologue_cost, cost_vec, cost_vec,
@@ -12250,7 +12260,8 @@ vectorizable_load (vec_info *vinfo,
 	    {
 	      if (costing_p)
 		inside_cost = record_stmt_cost (cost_vec, 1, vec_perm,
-						stmt_info, 0, vect_body);
+						stmt_info, slp_node, 0,
+						vect_body);
 	      else
 		{
 		  tree perm_mask = perm_mask_for_reverse (vectype);
@@ -12319,7 +12330,8 @@ vectorizable_load (vec_info *vinfo,
 	      vect_transform_slp_perm_load (vinfo, slp_node, vNULL, nullptr, vf,
 					    true, &n_perms, nullptr);
 	      inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
-					      stmt_info, 0, vect_body);
+					      stmt_info, slp_node, 0,
+					      vect_body);
 	    }
 	  else
 	    {
@@ -12346,7 +12358,8 @@ vectorizable_load (vec_info *vinfo,
 		  int group_size = DR_GROUP_SIZE (first_stmt_info);
 		  int nstmts = ceil_log2 (group_size) * group_size;
 		  inside_cost += record_stmt_cost (cost_vec, nstmts, vec_perm,
-						   stmt_info, 0, vect_body);
+						   stmt_info, slp_node, 0,
+						   vect_body);
 
 		  if (dump_enabled_p ())
 		    dump_printf_loc (MSG_NOTE, vect_location,
@@ -12375,7 +12388,7 @@ vectorizable_load (vec_info *vinfo,
 		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
 		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
       if (n_adjacent_loads > 0)
-	vect_get_load_cost (vinfo, stmt_info, n_adjacent_loads,
+	vect_get_load_cost (vinfo, stmt_info, slp_node, n_adjacent_loads,
 			    alignment_support_scheme, misalignment, false,
 			    &inside_cost, &prologue_cost, cost_vec, cost_vec,
 			    true);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index dcad41dcf182045e868a83276e39ca71a82738d5..7f69a3f57b492ad9ecbd63ecdea27e9abe386ac5 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2354,6 +2354,10 @@ extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
 				  enum vect_cost_for_stmt,
 				  enum vect_cost_model_location);
+extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
+				  enum vect_cost_for_stmt, stmt_vec_info,
+				  slp_tree, tree, int,
+				  enum vect_cost_model_location);
 
 /* Overload of record_stmt_cost with VECTYPE derived from STMT_INFO.  */
 
@@ -2375,12 +2379,8 @@ record_stmt_cost (stmt_vector_for_cost *body_cost_vec, int count,
 		  slp_tree node,
 		  int misalign, enum vect_cost_model_location where)
 {
-  if (node)
-    return record_stmt_cost (body_cost_vec, count, kind, node,
-			     STMT_VINFO_VECTYPE (stmt_info), misalign, where);
-  else
-    return record_stmt_cost (body_cost_vec, count, kind, stmt_info,
-			     STMT_VINFO_VECTYPE (stmt_info), misalign, where);
+  return record_stmt_cost (body_cost_vec, count, kind, stmt_info, node,
+			   STMT_VINFO_VECTYPE (stmt_info), misalign, where);
 }
 
 extern void vect_finish_replace_stmt (vec_info *, stmt_vec_info, gimple *);
@@ -2411,12 +2411,12 @@ extern bool vect_nop_conversion_p (stmt_vec_info);
 extern opt_result vect_analyze_stmt (vec_info *, stmt_vec_info, bool *,
 				     slp_tree,
 				     slp_instance, stmt_vector_for_cost *);
-extern void vect_get_load_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_load_cost (vec_info *, stmt_vec_info, slp_tree, int,
 				dr_alignment_support, int, bool,
 				unsigned int *, unsigned int *,
 				stmt_vector_for_cost *,
 				stmt_vector_for_cost *, bool);
-extern void vect_get_store_cost (vec_info *, stmt_vec_info, int,
+extern void vect_get_store_cost (vec_info *, stmt_vec_info, slp_tree, int,
 				 dr_alignment_support, int,
 				 unsigned int *, stmt_vector_for_cost *);
 extern bool vect_supportable_shift (vec_info *, enum tree_code, tree);



Reply via email to