From: Pan Li <pan2...@intel.com>

This patch would like to allow generation of MASK_LEN_STRIDED_LOAD{,STORE} IR
for invariant-stride memory accesses.  For example, consider the loop below:

void foo (int * __restrict a, int * __restrict b, int stride, int n)
{
    for (int i = 0; i < n; i++)
      a[i*stride] = b[i*stride] + 100;
}
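
The stride here is loop invariant: every iteration advances both accesses by
the same number of bytes (stride * sizeof (int)).  A minimal scalar sketch
that spells out this invariant byte stride (the foo_bytes form below is purely
illustrative, not generated code):

void foo_bytes (int * __restrict a, int * __restrict b, long stride, int n)
{
    long step = stride * (long) sizeof (int);   /* invariant byte stride */
    char *pa = (char *) a, *pb = (char *) b;

    for (int i = 0; i < n; i++, pa += step, pb += step)
      *(int *) pa = *(int *) pb + 100;   /* a[i*stride] = b[i*stride] + 100 */
}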

Before this patch:
  _73 = .SELECT_VL (ivtmp_71, POLY_INT_CST [4, 4]);
  _52 = _54 * _73;
  vect__5.16_61 = .MASK_LEN_GATHER_LOAD (vectp_b.14_59, _58, 4, { 0, ... }, { -1, ... }, _73, 0);
  vect__7.17_63 = vect__5.16_61 + { 100, ... };
  .MASK_LEN_SCATTER_STORE (vectp_a.18_67, _58, 4, vect__7.17_63, { -1, ... }, _73, 0);
  vectp_b.14_60 = vectp_b.14_59 + _52;
  vectp_a.18_68 = vectp_a.18_67 + _52;
  ivtmp_72 = ivtmp_71 - _73;
After this patch:
  _70 = .SELECT_VL (ivtmp_68, POLY_INT_CST [4, 4]);
  _52 = _54 * _70;
  vect__5.16_58 = .MASK_LEN_STRIDED_LOAD (vectp_b.14_56, _55, { 0, ... }, { -1, ... }, _70, 0);
  vect__7.17_60 = vect__5.16_58 + { 100, ... };
  .MASK_LEN_STRIDED_STORE (vectp_a.18_64, _55, vect__7.17_60, { -1, ... }, _70, 0);
  vectp_b.14_57 = vectp_b.14_56 + _52;
  vectp_a.18_65 = vectp_a.18_64 + _52;
  ivtmp_69 = ivtmp_68 - _70;
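
For reference, a rough scalar sketch of the element-wise behaviour the two
strided calls above are expected to provide, with the arguments in the order
shown in the dump (base pointer, byte stride, else-value/stored values, mask,
length, bias).  The function names, the int element type and the exact
handling of mask/length/bias are illustrative assumptions, not the
internal-fn specification:

void strided_load_sketch (int *dest, const char *base, long stride,
                          const _Bool *mask, unsigned len)
{
    /* Active element i is read from base + i * stride bytes.  */
    for (unsigned i = 0; i < len; i++)
      if (mask[i])
        dest[i] = *(const int *) (base + (long) i * stride);
}

void strided_store_sketch (char *base, long stride, const int *vals,
                           const _Bool *mask, unsigned len)
{
    /* Active element i is written to base + i * stride bytes.  */
    for (unsigned i = 0; i < len; i++)
      if (mask[i])
        *(int *) (base + (long) i * stride) = vals[i];
}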

The following test suites passed for this patch:
* The x86 bootstrap test.
* The x86 full regression test.
* The riscv full regression test.

gcc/ChangeLog:

        * tree-vect-stmts.cc (vect_get_strided_load_store_ops): Handle
        MASK_LEN_STRIDED_LOAD{,STORE} after the supported check.
        (vectorizable_store): Generate MASK_LEN_STRIDED_STORE when the
        offset of the scatter is not a vector type.
        (vectorizable_load): Ditto, but generate MASK_LEN_STRIDED_LOAD
        for the gather.

Signed-off-by: Pan Li <pan2...@intel.com>
Co-Authored-By: Juzhe-Zhong <juzhe.zh...@rivai.ai>
---
 gcc/tree-vect-stmts.cc | 45 +++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index e7f14c3144c..78d66a4ef9d 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2950,6 +2950,15 @@ vect_get_strided_load_store_ops (stmt_vec_info stmt_info,
       *dataref_bump = cse_and_gimplify_to_preheader (loop_vinfo, bump);
     }
 
+  internal_fn ifn
+    = DR_IS_READ (dr) ? IFN_MASK_LEN_STRIDED_LOAD : IFN_MASK_LEN_STRIDED_STORE;
+  if (direct_internal_fn_supported_p (ifn, vectype, OPTIMIZE_FOR_SPEED))
+    {
+      *vec_offset = cse_and_gimplify_to_preheader (loop_vinfo,
+                                                  unshare_expr (DR_STEP (dr)));
+      return;
+    }
+
   /* The offset given in GS_INFO can have pointer type, so use the element
      type of the vector instead.  */
   tree offset_type = TREE_TYPE (gs_info->offset_vectype);
@@ -9194,10 +9203,20 @@ vectorizable_store (vec_info *vinfo,
 
                  gcall *call;
                  if (final_len && final_mask)
-                   call = gimple_build_call_internal
-                            (IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
-                             vec_offset, scale, vec_oprnd, final_mask,
-                             final_len, bias);
+                   {
+                     if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
+                       call = gimple_build_call_internal (
+                         IFN_MASK_LEN_SCATTER_STORE, 7, dataref_ptr,
+                         vec_offset, scale, vec_oprnd, final_mask, final_len,
+                         bias);
+                     else
+                       /* A non-vector offset indicates that we prefer
+                          MASK_LEN_STRIDED_STORE over MASK_LEN_SCATTER_STORE
+                          with a direct stride argument.  */
+                       call = gimple_build_call_internal (
+                         IFN_MASK_LEN_STRIDED_STORE, 6, dataref_ptr,
+                         vec_offset, vec_oprnd, final_mask, final_len, bias);
+                   }
                  else if (final_mask)
                    call = gimple_build_call_internal
                             (IFN_MASK_SCATTER_STORE, 5, dataref_ptr,
@@ -11194,11 +11213,19 @@ vectorizable_load (vec_info *vinfo,
 
                  gcall *call;
                  if (final_len && final_mask)
-                   call
-                     = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
-                                                   dataref_ptr, vec_offset,
-                                                   scale, zero, final_mask,
-                                                   final_len, bias);
+                   {
+                     if (VECTOR_TYPE_P (TREE_TYPE (vec_offset)))
+                       call = gimple_build_call_internal (
+                         IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
+                         scale, zero, final_mask, final_len, bias);
+                     else
+                       /* A non-vector offset indicates that we prefer
+                          MASK_LEN_STRIDED_LOAD over MASK_LEN_GATHER_LOAD
+                          with a direct stride argument.  */
+                       call = gimple_build_call_internal (
+                         IFN_MASK_LEN_STRIDED_LOAD, 6, dataref_ptr, vec_offset,
+                         zero, final_mask, final_len, bias);
+                   }
                  else if (final_mask)
                    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
                                                       dataref_ptr, vec_offset,
-- 
2.43.0
