This patch adjusts the cost handling on VMAT_GATHER_SCATTER in function vectorizable_load; we no longer call function vect_model_load_cost for it.
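For reference, below is a minimal standalone sketch of the per-copy stmt counts the new inline costing records for gather loads.  It is illustrative only: the gather_cost struct and gather_load_cost function are made up here and are not GCC interfaces; the counts mirror what vect_model_load_cost used to record for this access type.

  /* Illustrative only: mirrors the counts recorded via record_stmt_cost
     in this patch, not the actual GCC cost hooks.  */
  #include <stdio.h>

  struct gather_cost { unsigned vec_to_scalar, scalar_load, vec_construct; };

  /* NUNITS is the number of vector elements per copy; IFN_GATHER is
     nonzero when a gather internal function is available.  */
  static struct gather_cost
  gather_load_cost (unsigned nunits, int ifn_gather)
  {
    struct gather_cost c = { 0, 0, 0 };
    if (ifn_gather)
      /* IFN gather: N scalar loads.  */
      c.scalar_load = nunits;
    else
      {
        /* Emulated gather: N offset element extracts, N scalar loads,
           plus one vec_construct to build the result vector.  */
        c.vec_to_scalar = nunits;
        c.scalar_load = nunits;
        c.vec_construct = 1;
      }
    return c;
  }

  int
  main (void)
  {
    struct gather_cost c = gather_load_cost (4, 0);
    printf ("vec_to_scalar=%u scalar_load=%u vec_construct=%u\n",
            c.vec_to_scalar, c.scalar_load, c.vec_construct);
    return 0;
  }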
The new costing mainly covers gather loads with an IFN and emulated gather loads, and it follows the existing handling in function vect_model_load_cost.  This patch shouldn't cause any functional changes.

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Adjust the cost handling on
	VMAT_GATHER_SCATTER without calling vect_model_load_cost.
	(vect_model_load_cost): Adjust the assertion on VMAT_GATHER_SCATTER,
	remove VMAT_GATHER_SCATTER related handling and the related parameter
	gs_info.
---
 gcc/tree-vect-stmts.cc | 123 +++++++++++++++++++++++++----------------
 1 file changed, 75 insertions(+), 48 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 651dc800380..a3fd0bf879e 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1131,11 +1131,10 @@ vect_model_load_cost (vec_info *vinfo,
                        vect_memory_access_type memory_access_type,
                        dr_alignment_support alignment_support_scheme,
                        int misalignment,
-                       gather_scatter_info *gs_info,
                        slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
 {
-  gcc_assert ((memory_access_type != VMAT_GATHER_SCATTER || !gs_info->decl)
+  gcc_assert (memory_access_type != VMAT_GATHER_SCATTER
               && memory_access_type != VMAT_INVARIANT
               && memory_access_type != VMAT_ELEMENTWISE
               && memory_access_type != VMAT_STRIDED_SLP);
@@ -1222,35 +1221,9 @@ vect_model_load_cost (vec_info *vinfo,
                              group_size);
     }
 
-  /* The loads themselves.  */
-  if (memory_access_type == VMAT_GATHER_SCATTER)
-    {
-      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-      unsigned int assumed_nunits = vect_nunits_for_cost (vectype);
-      if (memory_access_type == VMAT_GATHER_SCATTER
-          && gs_info->ifn == IFN_LAST && !gs_info->decl)
-        /* For emulated gathers N offset vector element extracts
-           (we assume the scalar scaling and ptr + offset add is consumed by
-           the load).  */
-        inside_cost += record_stmt_cost (cost_vec, ncopies * assumed_nunits,
-                                         vec_to_scalar, stmt_info, 0,
-                                         vect_body);
-      /* N scalar loads plus gathering them into a vector.  */
-      inside_cost += record_stmt_cost (cost_vec,
-                                       ncopies * assumed_nunits,
-                                       scalar_load, stmt_info, 0, vect_body);
-    }
-  else
-    vect_get_load_cost (vinfo, stmt_info, ncopies,
-                        alignment_support_scheme, misalignment, first_stmt_p,
-                        &inside_cost, &prologue_cost,
-                        cost_vec, cost_vec, true);
-
-  if (memory_access_type == VMAT_GATHER_SCATTER
-      && gs_info->ifn == IFN_LAST
-      && !gs_info->decl)
-    inside_cost += record_stmt_cost (cost_vec, ncopies, vec_construct,
-                                     stmt_info, 0, vect_body);
+  vect_get_load_cost (vinfo, stmt_info, ncopies, alignment_support_scheme,
+                      misalignment, first_stmt_p, &inside_cost, &prologue_cost,
+                      cost_vec, cost_vec, true);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -10137,6 +10110,7 @@ vectorizable_load (vec_info *vinfo,
     }
   tree vec_mask = NULL_TREE;
   poly_uint64 group_elt = 0;
+  unsigned int inside_cost = 0;
   for (j = 0; j < ncopies; j++)
     {
       /* 1. Create the vector or array pointer update chain.  */
@@ -10268,23 +10242,25 @@ vectorizable_load (vec_info *vinfo,
           /* Record that VEC_ARRAY is now dead.  */
           vect_clobber_variable (vinfo, stmt_info, gsi, vec_array);
         }
-      else if (!costing_p)
+      else
         {
           for (i = 0; i < vec_num; i++)
             {
               tree final_mask = NULL_TREE;
-              if (loop_masks
-                  && memory_access_type != VMAT_INVARIANT)
-                final_mask = vect_get_loop_mask (gsi, loop_masks,
-                                                 vec_num * ncopies,
-                                                 vectype, vec_num * j + i);
-              if (vec_mask)
-                final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
-                                               final_mask, vec_mask, gsi);
-
-              if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-                dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
-                                               gsi, stmt_info, bump);
+              if (!costing_p)
+                {
+                  if (loop_masks && memory_access_type != VMAT_INVARIANT)
+                    final_mask
+                      = vect_get_loop_mask (gsi, loop_masks, vec_num * ncopies,
+                                            vectype, vec_num * j + i);
+                  if (vec_mask)
+                    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+                                                   final_mask, vec_mask, gsi);
+
+                  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+                    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+                                                   gsi, stmt_info, bump);
+                }
 
               /* 2. Create the vector-load in the loop.  */
               switch (alignment_support_scheme)
@@ -10298,6 +10274,16 @@ vectorizable_load (vec_info *vinfo,
                     if (memory_access_type == VMAT_GATHER_SCATTER
                         && gs_info.ifn != IFN_LAST)
                       {
+                        if (costing_p)
+                          {
+                            unsigned int cnunits
+                              = vect_nunits_for_cost (vectype);
+                            inside_cost
+                              = record_stmt_cost (cost_vec, cnunits,
+                                                  scalar_load, stmt_info, 0,
+                                                  vect_body);
+                            goto vec_num_loop_costing_end;
+                          }
                         if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
                           vec_offset = vec_offsets[vec_num * j + i];
                         tree zero = build_zero_cst (vectype);
@@ -10322,6 +10308,25 @@ vectorizable_load (vec_info *vinfo,
                         gcc_assert (!final_mask);
                         unsigned HOST_WIDE_INT const_nunits
                           = nunits.to_constant ();
+                        if (costing_p)
+                          {
+                            /* For emulated gathers N offset vector element
+                               offset add is consumed by the load).  */
+                            inside_cost
+                              = record_stmt_cost (cost_vec, const_nunits,
+                                                  vec_to_scalar, stmt_info, 0,
+                                                  vect_body);
+                            /* N scalar loads plus gathering them into a
+                               vector.  */
+                            inside_cost
+                              = record_stmt_cost (cost_vec, const_nunits,
+                                                  scalar_load, stmt_info, 0,
+                                                  vect_body);
+                            inside_cost
+                              = record_stmt_cost (cost_vec, 1, vec_construct,
+                                                  stmt_info, 0, vect_body);
+                            goto vec_num_loop_costing_end;
+                          }
                         unsigned HOST_WIDE_INT const_offset_nunits
                           = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
                               .to_constant ();
@@ -10374,6 +10379,9 @@ vectorizable_load (vec_info *vinfo,
                         break;
                       }
 
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
+
                     align =
                       known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
                     if (alignment_support_scheme == dr_aligned)
@@ -10544,6 +10552,8 @@ vectorizable_load (vec_info *vinfo,
                   }
                 case dr_explicit_realign:
                   {
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
                     tree ptr, bump;
 
                     tree vs = size_int (TYPE_VECTOR_SUBPARTS (vectype));
@@ -10606,6 +10616,8 @@ vectorizable_load (vec_info *vinfo,
                   }
                 case dr_explicit_realign_optimized:
                   {
+                    if (costing_p)
+                      goto vec_num_loop_costing_end;
                     if (TREE_CODE (dataref_ptr) == SSA_NAME)
                       new_temp = copy_ssa_name (dataref_ptr);
                     else
@@ -10702,10 +10714,14 @@ vectorizable_load (vec_info *vinfo,
                                                  gsi, stmt_info, bump);
                   group_elt = 0;
                 }
+vec_num_loop_costing_end:
+              ;
             }
           /* Bump the vector pointer to account for a gap or for excess
              elements loaded for a permuted SLP load.  */
-          if (maybe_ne (group_gap_adj, 0U) && slp_perm)
+          if (!costing_p
+              && maybe_ne (group_gap_adj, 0U)
+              && slp_perm)
             {
               poly_wide_int bump_val
                 = (wi::to_wide (TYPE_SIZE_UNIT (elem_type))
@@ -10754,9 +10770,20 @@ vectorizable_load (vec_info *vinfo,
     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 
   if (costing_p)
-    vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
-                          alignment_support_scheme, misalignment, &gs_info,
-                          slp_node, cost_vec);
+    {
+      if (memory_access_type == VMAT_GATHER_SCATTER)
+        {
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                             "vect_model_load_cost: inside_cost = %u, "
+                             "prologue_cost = 0 .\n",
+                             inside_cost);
+        }
+      else
+        vect_model_load_cost (vinfo, stmt_info, ncopies, vf, memory_access_type,
+                              alignment_support_scheme, misalignment, slp_node,
+                              cost_vec);
+    }
 
   return true;
 }
-- 
2.31.1