Passed the x86 bootstrap and regression tests.

Pan

-----Original Message-----
From: Juzhe-Zhong <juzhe.zh...@rivai.ai> 
Sent: Tuesday, October 31, 2023 6:08 PM
To: gcc-patches@gcc.gnu.org
Cc: richard.sandif...@arm.com; rguent...@suse.de; jeffreya...@gmail.com; 
Juzhe-Zhong <juzhe.zh...@rivai.ai>
Subject: [PATCH] VECT: Support mask_len_strided_load/mask_len_strided_store in 
loop vectorize

This patch supports the loop vectorizer generating direct strided load/store IFNs
if the target enables them.

Note that this patch provides the ability for targets that enable strided
load/store but not gather/scatter to vectorize strided memory accesses.

gcc/ChangeLog:

        * optabs-query.cc (supports_vec_gather_load_p): Support strided 
load/store.
        (supports_vec_scatter_store_p): Ditto.
        * optabs-query.h (supports_vec_gather_load_p): Ditto.
        (supports_vec_scatter_store_p): Ditto.
        * tree-vect-data-refs.cc (vect_gather_scatter_fn_p): Ditto.
        (vect_check_gather_scatter): Ditto.
        * tree-vect-stmts.cc (check_load_store_for_partial_vectors): Ditto.
        (vect_truncate_gather_scatter_offset): Ditto.
        (vect_use_strided_gather_scatters_p): Ditto.
        (vect_get_strided_load_store_ops): Ditto.
        (vectorizable_store): Ditto.
        (vectorizable_load): Ditto.
        * tree-vectorizer.h (vect_gather_scatter_fn_p): Ditto.

---
 gcc/optabs-query.cc        | 27 ++++++++++-----
 gcc/optabs-query.h         |  4 +--
 gcc/tree-vect-data-refs.cc | 71 ++++++++++++++++++++++++++++----------
 gcc/tree-vect-stmts.cc     | 46 +++++++++++++++++-------
 gcc/tree-vectorizer.h      |  3 +-
 5 files changed, 109 insertions(+), 42 deletions(-)

diff --git a/gcc/optabs-query.cc b/gcc/optabs-query.cc
index 947ccef218c..ea594baf15d 100644
--- a/gcc/optabs-query.cc
+++ b/gcc/optabs-query.cc
@@ -670,14 +670,19 @@ supports_vec_convert_optab_p (optab op, machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_gather_load_p (machine_mode mode)
+supports_vec_gather_load_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_gather_load[mode])
     this_fn_optabs->supports_vec_gather_load[mode]
       = (supports_vec_convert_optab_p (gather_load_optab, mode)
-        || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
-        || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
-        ? 1 : -1);
+            || supports_vec_convert_optab_p (mask_gather_load_optab, mode)
+            || supports_vec_convert_optab_p (mask_len_gather_load_optab, mode)
+            || (strided_p
+                && convert_optab_handler (mask_len_strided_load_optab, mode,
+                                          Pmode)
+                     != CODE_FOR_nothing)
+          ? 1
+          : -1);
 
   return this_fn_optabs->supports_vec_gather_load[mode] > 0;
 }
@@ -687,14 +692,20 @@ supports_vec_gather_load_p (machine_mode mode)
    for at least one vector mode.  */
 
 bool
-supports_vec_scatter_store_p (machine_mode mode)
+supports_vec_scatter_store_p (machine_mode mode, bool strided_p)
 {
   if (!this_fn_optabs->supports_vec_scatter_store[mode])
     this_fn_optabs->supports_vec_scatter_store[mode]
       = (supports_vec_convert_optab_p (scatter_store_optab, mode)
-        || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
-        || supports_vec_convert_optab_p (mask_len_scatter_store_optab, mode)
-        ? 1 : -1);
+            || supports_vec_convert_optab_p (mask_scatter_store_optab, mode)
+            || supports_vec_convert_optab_p (mask_len_scatter_store_optab,
+                                             mode)
+            || (strided_p
+                && convert_optab_handler (mask_len_strided_store_optab, mode,
+                                          Pmode)
+                     != CODE_FOR_nothing)
+          ? 1
+          : -1);
 
   return this_fn_optabs->supports_vec_scatter_store[mode] > 0;
 }
diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 920eb6a1b67..7c22edc5a78 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -191,8 +191,8 @@ bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool can_atomic_load_p (machine_mode);
 bool lshift_cheap_p (bool);
-bool supports_vec_gather_load_p (machine_mode = E_VOIDmode);
-bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode);
+bool supports_vec_gather_load_p (machine_mode = E_VOIDmode, bool = false);
+bool supports_vec_scatter_store_p (machine_mode = E_VOIDmode, bool = false);
 bool can_vec_extract (machine_mode, machine_mode);
 
 /* Version of find_widening_optab_handler_and_mode that operates on
diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
index d5c9c4a11c2..d374849b0a7 100644
--- a/gcc/tree-vect-data-refs.cc
+++ b/gcc/tree-vect-data-refs.cc
@@ -3913,9 +3913,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    *IFN_OUT and the vector type for the offset in *OFFSET_VECTYPE_OUT.  */
 
 bool
-vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
-                         tree vectype, tree memory_type, tree offset_type,
-                         int scale, internal_fn *ifn_out,
+vect_gather_scatter_fn_p (vec_info *vinfo, bool strided_p, bool read_p,
+                         bool masked_p, tree vectype, tree memory_type,
+                         tree offset_type, int scale, internal_fn *ifn_out,
                          tree *offset_vectype_out)
 {
   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
@@ -3926,7 +3926,7 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
     return false;
 
   /* Work out which function we need.  */
-  internal_fn ifn, alt_ifn, alt_ifn2;
+  internal_fn ifn, alt_ifn, alt_ifn2, alt_ifn3;
   if (read_p)
     {
       ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
@@ -3935,6 +3935,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
         use MASK_LEN_GATHER_LOAD regardless whether len and
         mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_GATHER_LOAD;
+      /* When target supports MASK_LEN_STRIDED_LOAD, we can relax the
+        restrictions around the relationship of the vector offset type
+        to the data being loaded by using a gather load with strided access.
+        E.g. a "gather" of N bytes with a 64-bit stride would in principle
+        be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_LOAD;
     }
   else
     {
@@ -3944,6 +3950,12 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
         use MASK_LEN_SCATTER_STORE regardless whether len and
         mask are valid or not.  */
       alt_ifn2 = IFN_MASK_LEN_SCATTER_STORE;
+      /* When target supports MASK_LEN_STRIDED_STORE, we can relax the
+        restrictions around the relationship of the vector offset type
+        to the data being stored by using a scatter store with strided access.
+        E.g. a "scatter" of N bytes with a 64-bit stride would in principle
+        be possible without needing an Nx64-bit vector offset type.  */
+      alt_ifn3 = IFN_MASK_LEN_STRIDED_STORE;
     }
 
   for (;;)
@@ -3953,8 +3965,20 @@ vect_gather_scatter_fn_p (vec_info *vinfo, bool read_p, bool masked_p,
        return false;
 
       /* Test whether the target supports this combination.  */
-      if (internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
-                                                 offset_vectype, scale))
+      /* We don't need to check whether target supports gather/scatter IFN
+        with expected vector offset for gather/scatter with a strided access
+        when target itself support strided load/store IFN.  */
+      if (strided_p
+         && internal_strided_fn_supported_p (alt_ifn3, vectype, offset_type,
+                                             scale))
+       {
+         *ifn_out = alt_ifn3;
+         *offset_vectype_out = offset_vectype;
+         return true;
+       }
+      else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
+                                                      memory_type,
+                                                      offset_vectype, scale))
        {
          *ifn_out = ifn;
          *offset_vectype_out = offset_vectype;
@@ -4047,9 +4071,12 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   /* True if we should aim to use internal functions rather than
      built-in functions.  */
-  bool use_ifn_p = (DR_IS_READ (dr)
-                   ? supports_vec_gather_load_p (TYPE_MODE (vectype))
-                   : supports_vec_scatter_store_p (TYPE_MODE (vectype)));
+  bool use_ifn_p
+    = (DR_IS_READ (dr)
+        ? supports_vec_gather_load_p (TYPE_MODE (vectype),
+                                      STMT_VINFO_STRIDED_P (stmt_info))
+        : supports_vec_scatter_store_p (TYPE_MODE (vectype),
+                                        STMT_VINFO_STRIDED_P (stmt_info)));
 
   base = DR_REF (dr);
   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -4196,13 +4223,17 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
              /* Only treat this as a scaling operation if the target
                 supports it for at least some offset type.  */
              if (use_ifn_p
-                 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-                                               masked_p, vectype, memory_type,
+                 && !vect_gather_scatter_fn_p (loop_vinfo,
+                                               STMT_VINFO_STRIDED_P (stmt_info),
+                                               DR_IS_READ (dr), masked_p,
+                                               vectype, memory_type,
                                                signed_char_type_node,
                                                new_scale, &ifn,
                                                &offset_vectype)
-                 && !vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-                                               masked_p, vectype, memory_type,
+                 && !vect_gather_scatter_fn_p (loop_vinfo,
+                                               STMT_VINFO_STRIDED_P (stmt_info),
+                                               DR_IS_READ (dr), masked_p,
+                                               vectype, memory_type,
                                                unsigned_char_type_node,
                                                new_scale, &ifn,
                                                &offset_vectype))
@@ -4225,8 +4256,10 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
          if (use_ifn_p
              && TREE_CODE (off) == SSA_NAME
              && !POINTER_TYPE_P (TREE_TYPE (off))
-             && vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr),
-                                          masked_p, vectype, memory_type,
+             && vect_gather_scatter_fn_p (loop_vinfo,
+                                          STMT_VINFO_STRIDED_P (stmt_info),
+                                          DR_IS_READ (dr), masked_p,
+                                          vectype, memory_type,
                                           TREE_TYPE (off), scale, &ifn,
                                           &offset_vectype))
            break;
@@ -4280,9 +4313,11 @@ vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
 
   if (use_ifn_p)
     {
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-                                    vectype, memory_type, offtype, scale,
-                                    &ifn, &offset_vectype))
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+                                    STMT_VINFO_STRIDED_P (stmt_info),
+                                    DR_IS_READ (dr), masked_p, vectype,
+                                    memory_type, offtype, scale, &ifn,
+                                    &offset_vectype))
        ifn = IFN_LAST;
       decl = NULL_TREE;
     }
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index a9200767f67..8ff06bd3acb 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -1506,10 +1506,15 @@ check_load_store_for_partial_vectors (loop_vec_info loop_vinfo, tree vectype,
       internal_fn len_ifn = (is_load
                             ? IFN_MASK_LEN_GATHER_LOAD
                             : IFN_MASK_LEN_SCATTER_STORE);
-      if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
-                                                 gs_info->memory_type,
-                                                 gs_info->offset_vectype,
-                                                 gs_info->scale))
+      if (internal_strided_fn_p (gs_info->ifn)
+         && internal_strided_fn_supported_p (gs_info->ifn, vectype,
+                                             TREE_TYPE (gs_info->offset),
+                                             gs_info->scale))
+       vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
+      else if (internal_gather_scatter_fn_supported_p (len_ifn, vectype,
+                                                      gs_info->memory_type,
+                                                      gs_info->offset_vectype,
+                                                      gs_info->scale))
        vect_record_loop_len (loop_vinfo, lens, nvectors, vectype, 1);
       else if (internal_gather_scatter_fn_supported_p (ifn, vectype,
                                                       gs_info->memory_type,
@@ -1693,8 +1698,10 @@ vect_truncate_gather_scatter_offset (stmt_vec_info stmt_info,
       /* See whether the target supports the operation with an offset
         no narrower than OFFSET_TYPE.  */
       tree memory_type = TREE_TYPE (DR_REF (dr));
-      if (!vect_gather_scatter_fn_p (loop_vinfo, DR_IS_READ (dr), masked_p,
-                                    vectype, memory_type, offset_type, scale,
+      if (!vect_gather_scatter_fn_p (loop_vinfo,
+                                    STMT_VINFO_STRIDED_P (stmt_info),
+                                    DR_IS_READ (dr), masked_p, vectype,
+                                    memory_type, offset_type, scale,
                                     &gs_info->ifn, &gs_info->offset_vectype)
          || gs_info->ifn == IFN_LAST)
        continue;
@@ -1734,6 +1741,15 @@ vect_use_strided_gather_scatters_p (stmt_vec_info stmt_info,
       || gs_info->ifn == IFN_LAST)
     return vect_truncate_gather_scatter_offset (stmt_info, loop_vinfo,
                                                masked_p, gs_info);
+  else if (internal_strided_fn_p (gs_info->ifn))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "using strided IFN for strided/grouped access,"
+                        " scale = %d\n",
+                        gs_info->scale);
+      return true;
+    }
 
   tree old_offset_type = TREE_TYPE (gs_info->offset);
   tree new_offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -3012,9 +3028,13 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
                          ssize_int (gs_info->scale));
   step = fold_convert (offset_type, step);
 
-  /* Create {0, X, X*2, X*3, ...}.  */
-  tree offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
-                            build_zero_cst (offset_type), step);
+  tree offset;
+  if (internal_strided_fn_p (gs_info->ifn))
+    offset = step;
+  else
+    /* Create {0, X, X*2, X*3, ...}.  */
+    offset = fold_build2 (VEC_SERIES_EXPR, gs_info->offset_vectype,
+                         build_zero_cst (offset_type), step);
   *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo, offset);
 }
 
@@ -9125,7 +9145,7 @@ vectorizable_store (vec_info *vinfo,
                vec_offset = vec_offsets[j];
              tree scale = size_int (gs_info.scale);
 
-             if (gs_info.ifn == IFN_MASK_LEN_SCATTER_STORE)
+             if (internal_fn_len_index (gs_info.ifn) >= 0)
                {
                  if (loop_lens)
                    final_len = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
@@ -9145,7 +9165,7 @@ vectorizable_store (vec_info *vinfo,
 
              gcall *call;
              if (final_len && final_mask)
-               call = gimple_build_call_internal (IFN_MASK_LEN_SCATTER_STORE,
+               call = gimple_build_call_internal (gs_info.ifn,
                                                   7, dataref_ptr, vec_offset,
                                                   scale, vec_oprnd, final_mask,
                                                   final_len, bias);
@@ -10949,7 +10969,7 @@ vectorizable_load (vec_info *vinfo,
                  tree zero = build_zero_cst (vectype);
                  tree scale = size_int (gs_info.scale);
 
-                 if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+                 if (internal_fn_len_index (gs_info.ifn) >= 0)
                    {
                      if (loop_lens)
                        final_len
@@ -10973,7 +10993,7 @@ vectorizable_load (vec_info *vinfo,
                  gcall *call;
                  if (final_len && final_mask)
                    call
-                     = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+                     = gimple_build_call_internal (gs_info.ifn, 7,
                                                    dataref_ptr, vec_offset,
                                                    scale, zero, final_mask,
                                                    final_len, bias);
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index a4043e4a656..76bf3aa14b4 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2309,7 +2309,8 @@ extern opt_result vect_analyze_data_refs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_instance_alignment (vec_info *, slp_instance);
 extern opt_result vect_analyze_data_ref_accesses (vec_info *, vec<int> *);
 extern opt_result vect_prune_runtime_alias_test_list (loop_vec_info);
-extern bool vect_gather_scatter_fn_p (vec_info *, bool, bool, tree, tree,
+extern bool vect_gather_scatter_fn_p (vec_info *,
+                                     bool, bool, bool, tree, tree,
                                      tree, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (stmt_vec_info, loop_vec_info,
                                       gather_scatter_info *);
-- 
2.36.3

Reply via email to