On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford
<richard.sandif...@arm.com> wrote:
> This is the main patch in the series.  It adds a new enum and routines
> for classifying a vector load or store implementation.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Why's the setting and checking of the memory access type conditional on !slp?
I'd rather avoid doing this :/

Otherwise it looks like a step in the right direction of splitting the
vectorizable_* functions into an analysis part that records all decisions
made and a transform part that just applies them.
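
For reference, below is a minimal standalone C sketch of that split, using
entirely hypothetical names (analyze_stmt, transform_stmt, struct stmt_info)
rather than anything taken from the patch: classify the access once during
analysis, record the decision, and only replay it at transform time.

/* Toy model of the analysis/transform split (hypothetical names, not GCC's):
   the analysis step classifies the access once and records the result;
   the transform step only replays the recorded decision.  */
#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

enum access_type { ACC_CONTIGUOUS, ACC_ELEMENTWISE };

struct stmt_info {
  bool strided;                 /* input property of the access */
  enum access_type access;      /* decision recorded during analysis */
};

/* Analysis: decide how the access will be implemented and remember it.  */
static bool
analyze_stmt (struct stmt_info *info)
{
  info->access = info->strided ? ACC_ELEMENTWISE : ACC_CONTIGUOUS;
  return true;                  /* analysis succeeded */
}

/* Transform: no re-analysis, just act on what analysis recorded.  */
static void
transform_stmt (const struct stmt_info *info)
{
  /* Sanity check that the recorded decision is still consistent.  */
  assert (info->access == (info->strided ? ACC_ELEMENTWISE : ACC_CONTIGUOUS));
  if (info->access == ACC_CONTIGUOUS)
    printf ("emit one contiguous vector access\n");
  else
    printf ("emit one scalar access per element\n");
}

int
main (void)
{
  struct stmt_info s = { .strided = true, .access = ACC_CONTIGUOUS };
  if (analyze_stmt (&s))
    transform_stmt (&s);
  return 0;
}

The gcc_assert in the patch that the recorded STMT_VINFO_MEMORY_ACCESS_TYPE
matches what get_load_store_type recomputes at transform time plays the same
role as the assert in this sketch.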

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
>         * tree-vectorizer.h (vect_memory_access_type): New enum.
>         (_stmt_vec_info): Add a memory_access_type field.
>         (STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
>         (vect_model_store_cost): Take an access type instead of a boolean.
>         (vect_model_load_cost): Likewise.
>         * tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
>         vect_model_store_cost and vect_model_load_cost.
>         * tree-vect-stmts.c (vec_load_store_type): New enum.
>         (vect_model_store_cost): Take an access type instead of a
>         store_lanes_p boolean.  Simplify tests.
>         (vect_model_load_cost): Likewise, but for load_lanes_p.
>         (get_group_load_store_type, get_load_store_type): New functions.
>         (vectorizable_store): Use get_load_store_type.  Record the access
>         type in STMT_VINFO_MEMORY_ACCESS_TYPE.
>         (vectorizable_load): Likewise.
>         (vectorizable_mask_load_store): Likewise.  Replace is_store
>         variable with vls_type.
>
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h
> +++ gcc/tree-vectorizer.h
> @@ -485,6 +485,33 @@ enum slp_vect_type {
>    hybrid
>  };
>
> +/* Describes how we're going to vectorize an individual load or store,
> +   or a group of loads or stores.  */
> +enum vect_memory_access_type {
> +  /* A simple contiguous access.  */
> +  VMAT_CONTIGUOUS,
> +
> +  /* A simple contiguous access in which the elements need to be permuted
> +     after loading or before storing.  Only used for loop vectorization;
> +     SLP uses separate permutes.  */
> +  VMAT_CONTIGUOUS_PERMUTE,
> +
> +  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
> +  VMAT_LOAD_STORE_LANES,
> +
> +  /* An access in which each scalar element is loaded or stored
> +     individually.  */
> +  VMAT_ELEMENTWISE,
> +
> +  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
> +     SLP accesses.  Each unrolled iteration uses a contiguous load
> +     or store for the whole group, but the groups from separate iterations
> +     are combined in the same way as for VMAT_ELEMENTWISE.  */
> +  VMAT_STRIDED_SLP,
> +
> +  /* The access uses gather loads or scatter stores.  */
> +  VMAT_GATHER_SCATTER
> +};
>
>  typedef struct data_reference *dr_p;
>
> @@ -602,6 +629,10 @@ typedef struct _stmt_vec_info {
>    /* True if this is an access with loop-invariant stride.  */
>    bool strided_p;
>
> +  /* Classifies how the load or store is going to be implemented
> +     for loop vectorization.  */
> +  vect_memory_access_type memory_access_type;
> +
>    /* For both loads and stores.  */
>    bool simd_lane_access_p;
>
> @@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
>  #define STMT_VINFO_DATA_REF(S)             (S)->data_ref_info
>  #define STMT_VINFO_GATHER_SCATTER_P(S)    (S)->gather_scatter_p
>  #define STMT_VINFO_STRIDED_P(S)                   (S)->strided_p
> +#define STMT_VINFO_MEMORY_ACCESS_TYPE(S)   (S)->memory_access_type
>  #define STMT_VINFO_SIMD_LANE_ACCESS_P(S)   (S)->simd_lane_access_p
>  #define STMT_VINFO_VEC_REDUCTION_TYPE(S)   (S)->v_reduc_type
>
> @@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt);
>  extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *,
>                                      stmt_vector_for_cost *,
>                                     stmt_vector_for_cost *);
> -extern void vect_model_store_cost (stmt_vec_info, int, bool,
> +extern void vect_model_store_cost (stmt_vec_info, int, vect_memory_access_type,
>                                    enum vect_def_type, slp_tree,
>                                    stmt_vector_for_cost *,
>                                    stmt_vector_for_cost *);
> -extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree,
> -                                 stmt_vector_for_cost *,
> +extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
> +                                 slp_tree, stmt_vector_for_cost *,
>                                   stmt_vector_for_cost *);
>  extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
>                                   enum vect_cost_for_stmt, stmt_vec_info,
> Index: gcc/tree-vect-slp.c
> ===================================================================
> --- gcc/tree-vect-slp.c
> +++ gcc/tree-vect-slp.c
> @@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
>    stmt_info = vinfo_for_stmt (stmt);
>    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
>      {
> +      vect_memory_access_type memory_access_type
> +       = (STMT_VINFO_STRIDED_P (stmt_info)
> +          ? VMAT_STRIDED_SLP
> +          : VMAT_CONTIGUOUS);
>        if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
> -       vect_model_store_cost (stmt_info, ncopies_for_cost, false,
> -                              vect_uninitialized_def,
> +       vect_model_store_cost (stmt_info, ncopies_for_cost,
> +                              memory_access_type, vect_uninitialized_def,
>                                node, prologue_cost_vec, body_cost_vec);
>        else
>         {
> @@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
>               ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
>             }
>           /* Record the cost for the vector loads.  */
> -         vect_model_load_cost (stmt_info, ncopies_for_cost, false,
> -                               node, prologue_cost_vec, body_cost_vec);
> +         vect_model_load_cost (stmt_info, ncopies_for_cost,
> +                               memory_access_type, node, prologue_cost_vec,
> +                               body_cost_vec);
>           return;
>         }
>      }
> Index: gcc/tree-vect-stmts.c
> ===================================================================
> --- gcc/tree-vect-stmts.c
> +++ gcc/tree-vect-stmts.c
> @@ -52,6 +52,14 @@ along with GCC; see the file COPYING3.  If not see
>  /* For lang_hooks.types.type_for_mode.  */
>  #include "langhooks.h"
>
> +/* Says whether a statement is a load, a store of a vectorized statement
> +   result, or a store of an invariant value.  */
> +enum vec_load_store_type {
> +  VLS_LOAD,
> +  VLS_STORE,
> +  VLS_STORE_INVARIANT
> +};
> +
>  /* Return the vectorized type for the given statement.  */
>
>  tree
> @@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info stmt_info,
>
>  void
>  vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
> -                      bool store_lanes_p, enum vect_def_type dt,
> -                      slp_tree slp_node,
> +                      vect_memory_access_type memory_access_type,
> +                      enum vect_def_type dt, slp_tree slp_node,
>                        stmt_vector_for_cost *prologue_cost_vec,
>                        stmt_vector_for_cost *body_cost_vec)
>  {
> @@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>    /* We assume that the cost of a single store-lanes instruction is
>       equivalent to the cost of GROUP_SIZE separate stores.  If a grouped
>       access is instead being provided by a permute-and-store operation,
> -     include the cost of the permutes.
> -
> -     For SLP, the caller has already counted the permutation, if any.  */
> -  if (grouped_access_p
> -      && first_stmt_p
> -      && !store_lanes_p
> -      && !STMT_VINFO_STRIDED_P (stmt_info)
> -      && !slp_node)
> +     include the cost of the permutes.  */
> +  if (first_stmt_p
> +      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>      {
>        /* Uses a high and low interleave or shuffle operations for each
>          needed permute.  */
> @@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
>
>    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
>    /* Costs of the stores.  */
> -  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> -    {
> -      /* N scalar stores plus extracting the elements.  */
> -      inside_cost += record_stmt_cost (body_cost_vec,
> -                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> -                                      scalar_store, stmt_info, 0, vect_body);
> -    }
> +  if (memory_access_type == VMAT_ELEMENTWISE)
> +    /* N scalar stores plus extracting the elements.  */
> +    inside_cost += record_stmt_cost (body_cost_vec,
> +                                    ncopies * TYPE_VECTOR_SUBPARTS (vectype),
> +                                    scalar_store, stmt_info, 0, vect_body);
>    else
>      vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec);
>
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      inside_cost += record_stmt_cost (body_cost_vec,
>                                      ncopies * TYPE_VECTOR_SUBPARTS (vectype),
>                                      vec_to_scalar, stmt_info, 0, vect_body);
> @@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int ncopies,
>
>  void
>  vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
> -                     bool load_lanes_p, slp_tree slp_node,
> +                     vect_memory_access_type memory_access_type,
> +                     slp_tree slp_node,
>                       stmt_vector_for_cost *prologue_cost_vec,
>                       stmt_vector_for_cost *body_cost_vec)
>  {
> @@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>    /* We assume that the cost of a single load-lanes instruction is
>       equivalent to the cost of GROUP_SIZE separate loads.  If a grouped
>       access is instead being provided by a load-and-permute operation,
> -     include the cost of the permutes.
> -
> -     For SLP, the caller has already counted the permutation, if any.  */
> -  if (grouped_access_p
> -      && first_stmt_p
> -      && !load_lanes_p
> -      && !STMT_VINFO_STRIDED_P (stmt_info)
> -      && !slp_node)
> +     include the cost of the permutes.  */
> +  if (first_stmt_p
> +      && memory_access_type == VMAT_CONTIGUOUS_PERMUTE)
>      {
>        /* Uses an even and odd extract operations or shuffle operations
>          for each needed permute.  */
> @@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      }
>
>    /* The loads themselves.  */
> -  if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p))
> +  if (memory_access_type == VMAT_ELEMENTWISE)
>      {
>        /* N scalar loads plus gathering them into a vector.  */
>        tree vectype = STMT_VINFO_VECTYPE (stmt_info);
> @@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
>      vect_get_load_cost (dr, ncopies, first_stmt_p,
>                         &inside_cost, &prologue_cost,
>                         prologue_cost_vec, body_cost_vec, true);
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct,
>                                      stmt_info, 0, vect_body);
>
> @@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, gimple *,
>                                   gimple_stmt_iterator *);
>
>
> +/* A subroutine of get_load_store_type, with a subset of the same
> +   arguments.  Handle the case where STMT is part of a grouped load
> +   or store.
> +
> +   For stores, the statements in the group are all consecutive
> +   and there is no gap at the end.  For loads, the statements in the
> +   group might not be consecutive; there can be gaps between statements
> +   as well as at the end.  */
> +
> +static bool
> +get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
> +                          vec_load_store_type vls_type,
> +                          vect_memory_access_type *memory_access_type)
> +{
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  vec_info *vinfo = stmt_info->vinfo;
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
> +  gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> +  unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> +  bool single_element_p = (stmt == first_stmt
> +                          && !GROUP_NEXT_ELEMENT (stmt_info));
> +  unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt));
> +  int nunits = TYPE_VECTOR_SUBPARTS (vectype);
> +
> +  /* True if the vectorized statements would access beyond the last
> +     statement in the group.  */
> +  bool overrun_p = false;
> +
> +  /* True if we can cope with such overrun by peeling for gaps, so that
> +     there is at least one final scalar iteration after the vector loop.  */
> +  bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner);
> +
> +  /* There can only be a gap at the end of the group if the stride is
> +     known at compile time.  */
> +  gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0);
> +
> +  /* Stores can't yet have gaps.  */
> +  gcc_assert (slp || vls_type == VLS_LOAD || gap == 0);
> +
> +  if (slp)
> +    {
> +      if (STMT_VINFO_STRIDED_P (stmt_info))
> +       {
> +         /* Try to use consecutive accesses of GROUP_SIZE elements,
> +            separated by the stride, until we have a complete vector.
> +            Fall back to scalar accesses if that isn't possible.  */
> +         if (nunits % group_size == 0)
> +           *memory_access_type = VMAT_STRIDED_SLP;
> +         else
> +           *memory_access_type = VMAT_ELEMENTWISE;
> +       }
> +      else
> +       {
> +         overrun_p = loop_vinfo && gap != 0;
> +         if (overrun_p && vls_type != VLS_LOAD)
> +           {
> +             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                              "Grouped store with gaps requires"
> +                              " non-consecutive accesses\n");
> +             return false;
> +           }
> +         if (overrun_p && !can_overrun_p)
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "Peeling for outer loop is not supported\n");
> +             return false;
> +           }
> +         *memory_access_type = VMAT_CONTIGUOUS;
> +       }
> +    }
> +  else
> +    {
> +      /* We can always handle this case using elementwise accesses,
> +        but see if something more efficient is available.  */
> +      *memory_access_type = VMAT_ELEMENTWISE;
> +
> +      /* If there is a gap at the end of the group then these optimizations
> +        would access excess elements in the last iteration.  */
> +      bool would_overrun_p = (gap != 0);
> +      if (!STMT_VINFO_STRIDED_P (stmt_info)
> +         && (can_overrun_p || !would_overrun_p))
> +       {
> +         /* First try using LOAD/STORE_LANES.  */
> +         if (vls_type == VLS_LOAD
> +             ? vect_load_lanes_supported (vectype, group_size)
> +             : vect_store_lanes_supported (vectype, group_size))
> +           {
> +             *memory_access_type = VMAT_LOAD_STORE_LANES;
> +             overrun_p = would_overrun_p;
> +           }
> +
> +         /* If that fails, try using permuting loads.  */
> +         if (*memory_access_type == VMAT_ELEMENTWISE
> +             && (vls_type == VLS_LOAD
> +                 ? vect_grouped_load_supported (vectype, single_element_p,
> +                                                group_size)
> +                 : vect_grouped_store_supported (vectype, group_size)))
> +           {
> +             *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
> +             overrun_p = would_overrun_p;
> +           }
> +       }
> +    }
> +
> +  if (vls_type != VLS_LOAD && first_stmt == stmt)
> +    {
> +      /* STMT is the leader of the group. Check the operands of all the
> +        stmts of the group.  */
> +      gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> +      while (next_stmt)
> +       {
> +         gcc_assert (gimple_assign_single_p (next_stmt));
> +         tree op = gimple_assign_rhs1 (next_stmt);
> +         gimple *def_stmt;
> +         enum vect_def_type dt;
> +         if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> +           {
> +             if (dump_enabled_p ())
> +               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                "use not simple.\n");
> +             return false;
> +           }
> +         next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> +       }
> +    }
> +
> +  if (overrun_p)
> +    {
> +      gcc_assert (can_overrun_p);
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "Data access with gaps requires scalar "
> +                        "epilogue loop\n");
> +      LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> +    }
> +
> +  return true;
> +}
> +
> +/* Analyze load or store statement STMT of type VLS_TYPE.  Return true
> +   if there is a memory access type that the vectorized form can use,
> +   storing it in *MEMORY_ACCESS_TYPE if so.  If we decide to use gathers
> +   or scatters, fill in GS_INFO accordingly.
> +
> +   SLP says whether we're performing SLP rather than loop vectorization.
> +   VECTYPE is the vector type that the vectorized statements will use.  */
> +
> +static bool
> +get_load_store_type (gimple *stmt, tree vectype, bool slp,
> +                    vec_load_store_type vls_type,
> +                    vect_memory_access_type *memory_access_type,
> +                    gather_scatter_info *gs_info)
> +{
> +  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
> +  vec_info *vinfo = stmt_info->vinfo;
> +  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
> +  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +    {
> +      *memory_access_type = VMAT_GATHER_SCATTER;
> +      gimple *def_stmt;
> +      if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info))
> +       gcc_unreachable ();
> +      else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt,
> +                                   &gs_info->offset_dt,
> +                                   &gs_info->offset_vectype))
> +       {
> +         if (dump_enabled_p ())
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "%s index use not simple.\n",
> +                            vls_type == VLS_LOAD ? "gather" : "scatter");
> +         return false;
> +       }
> +    }
> +  else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +    {
> +      if (!get_group_load_store_type (stmt, vectype, slp, vls_type,
> +                                     memory_access_type))
> +       return false;
> +    }
> +  else if (STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      gcc_assert (!slp);
> +      *memory_access_type = VMAT_ELEMENTWISE;
> +    }
> +  else
> +    *memory_access_type = VMAT_CONTIGUOUS;
> +
> +  /* FIXME: At the moment the cost model seems to underestimate the
> +     cost of using elementwise accesses.  This check preserves the
> +     traditional behavior until that can be fixed.  */
> +  if (*memory_access_type == VMAT_ELEMENTWISE
> +      && !STMT_VINFO_STRIDED_P (stmt_info))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "not falling back to elementwise accesses\n");
> +      return false;
> +    }
> +  return true;
> +}
> +
>  /* Function vectorizable_mask_load_store.
>
>     Check if STMT performs a conditional load or store that can be vectorized.
> @@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    int i, j;
>    bool inv_p;
>    gather_scatter_info gs_info;
> -  bool is_store;
> +  vec_load_store_type vls_type;
>    tree mask;
>    gimple *def_stmt;
>    enum vect_def_type dt;
> @@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
>    gcc_assert (ncopies >= 1);
>
> -  is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE;
>    mask = gimple_call_arg (stmt, 2);
>
>    if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE)
> @@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>
>    elem_type = TREE_TYPE (vectype);
>
> -  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -    return false;
> -
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> -    return false;
> -
>    if (TREE_CODE (mask) != SSA_NAME)
>      return false;
>
> @@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>        || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS (vectype))
>      return false;
>
> -  if (is_store)
> +  if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
>      {
>        tree rhs = gimple_call_arg (stmt, 3);
>        if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, &rhs_vectype))
>         return false;
> +      if (dt == vect_constant_def || dt == vect_external_def)
> +       vls_type = VLS_STORE_INVARIANT;
> +      else
> +       vls_type = VLS_STORE;
>      }
> +  else
> +    vls_type = VLS_LOAD;
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "gather index use not simple.");
> -         return false;
> -       }
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, false, vls_type,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
> +    {
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
>        tree masktype
>         = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist))));
> @@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>           return false;
>         }
>      }
> +  else if (memory_access_type != VMAT_CONTIGUOUS)
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "unsupported access type for masked %s\n",
> +                        vls_type == VLS_LOAD ? "load" : "store");
> +      return false;
> +    }
>    else if (tree_int_cst_compare (nested_in_vect_loop
>                                  ? STMT_VINFO_DR_STEP (stmt_info)
>                                  : DR_STEP (dr), size_zero_node) <= 0)
> @@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>    else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
>            || !can_vec_mask_load_store_p (TYPE_MODE (vectype),
>                                           TYPE_MODE (mask_vectype),
> -                                         !is_store)
> +                                         vls_type == VLS_LOAD)
>            || (rhs_vectype
>                && !useless_type_conversion_p (vectype, rhs_vectype)))
>      return false;
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = call_vec_info_type;
> -      if (is_store)
> -       vect_model_store_cost (stmt_info, ncopies, false, dt,
> -                              NULL, NULL, NULL);
> +      if (vls_type == VLS_LOAD)
> +       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
> +                             NULL, NULL, NULL);
>        else
> -       vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL);
> +       vect_model_store_cost (stmt_info, ncopies, memory_access_type,
> +                              dt, NULL, NULL, NULL);
>        return true;
>      }
> +  gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
>    /** Transform.  **/
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, op;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>        gsi_replace (gsi, new_stmt, true);
>        return true;
>      }
> -  else if (is_store)
> +  else if (vls_type != VLS_LOAD)
>      {
>        tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE;
>        prev_stmt_info = NULL;
> @@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, gimple_stmt_iterator *gsi,
>         }
>      }
>
> -  if (!is_store)
> +  if (vls_type == VLS_LOAD)
>      {
>        /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed
>          from the IL.  */
> @@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gimple *ptr_incr = NULL;
>    int ncopies;
>    int j;
> -  gimple *next_stmt, *first_stmt = NULL;
> -  bool grouped_store = false;
> -  bool store_lanes_p = false;
> +  gimple *next_stmt, *first_stmt;
> +  bool grouped_store;
>    unsigned int group_size, i;
>    vec<tree> dr_chain = vNULL;
>    vec<tree> oprnds = vNULL;
> @@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gather_scatter_info gs_info;
>    enum vect_def_type scatter_src_dt = vect_unknown_def_type;
>    gimple *new_stmt;
> +  vec_load_store_type vls_type;
>
>    if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo)
>      return false;
> @@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        return false;
>      }
>
> +  if (dt == vect_constant_def || dt == vect_external_def)
> +    vls_type = VLS_STORE_INVARIANT;
> +  else
> +    vls_type = VLS_STORE;
> +
>    if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype))
>      return false;
>
> @@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>        if (negative)
>         {
> -         gcc_assert (!grouped_store);
>           alignment_support_scheme = vect_supportable_dr_alignment (dr, 
> false);
>           if (alignment_support_scheme != dr_aligned
>               && alignment_support_scheme != dr_unaligned_supported)
> @@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>      }
>
> -  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -    {
> -      grouped_store = true;
> -      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> -      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> -      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> -       {
> -         if (vect_store_lanes_supported (vectype, group_size))
> -           store_lanes_p = true;
> -         else if (!vect_grouped_store_supported (vectype, group_size))
> -           return false;
> -       }
> -
> -      if (STMT_VINFO_STRIDED_P (stmt_info)
> -         && slp
> -         && (group_size > nunits
> -             || nunits % group_size != 0))
> -       {
> -         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                          "unhandled strided group store\n");
> -         return false;
> -       }
> -
> -      if (first_stmt == stmt)
> -       {
> -          /* STMT is the leader of the group. Check the operands of all the
> -             stmts of the group.  */
> -          next_stmt = GROUP_NEXT_ELEMENT (stmt_info);
> -          while (next_stmt)
> -            {
> -             gcc_assert (gimple_assign_single_p (next_stmt));
> -             op = gimple_assign_rhs1 (next_stmt);
> -              if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt))
> -                {
> -                  if (dump_enabled_p ())
> -                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                     "use not simple.\n");
> -                  return false;
> -                }
> -              next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
> -            }
> -        }
> -    }
> -
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                             "scatter index use not simple.");
> -         return false;
> -       }
> -    }
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, slp, vls_type,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      if (!slp)
> +       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = store_vec_info_type;
>        /* The SLP costs are calculated during SLP analysis.  */
>        if (!PURE_SLP_STMT (stmt_info))
> -       vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt,
> +       vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt,
>                                NULL, NULL, NULL);
>        return true;
>      }
> +  if (!slp)
> +    gcc_assert (memory_access_type
> +               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
>
>    /** Transform.  **/
>
>    ensure_base_align (stmt_info, dr);
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        return true;
>      }
>
> +  grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info);
>    if (grouped_store)
>      {
> +      first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
>        first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
>        group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
>
> @@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "transform store. ncopies = %d\n", ncopies);
>
> -  if (STMT_VINFO_STRIDED_P (stmt_info))
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> @@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gcc_assert (alignment_support_scheme);
>    /* Targets with store-lane instructions must not require explicit
>       realignment.  */
> -  gcc_assert (!store_lanes_p
> +  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
>               || alignment_support_scheme == dr_aligned
>               || alignment_support_scheme == dr_unaligned_supported);
>
>    if (negative)
>      offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> -  if (store_lanes_p)
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>    else
>      aggr_type = vectype;
> @@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>                                            TYPE_SIZE_UNIT (aggr_type));
>         }
>
> -      if (store_lanes_p)
> +      if (memory_access_type == VMAT_LOAD_STORE_LANES)
>         {
>           tree vec_array;
>
> @@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gphi *phi = NULL;
>    vec<tree> dr_chain = vNULL;
>    bool grouped_load = false;
> -  bool load_lanes_p = false;
>    gimple *first_stmt;
>    gimple *first_stmt_for_drptr = NULL;
>    bool inv_p;
> @@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>      {
>        grouped_load = true;
>        /* FORNOW */
> -      gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
> +      gcc_assert (!nested_in_vect_loop);
> +      gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info));
>
>        first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
>        group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
> -      bool single_element_p = (first_stmt == stmt
> -                              && !GROUP_NEXT_ELEMENT (stmt_info));
> -
> -      if (!slp && !STMT_VINFO_STRIDED_P (stmt_info))
> -       {
> -         if (vect_load_lanes_supported (vectype, group_size))
> -           load_lanes_p = true;
> -         else if (!vect_grouped_load_supported (vectype, single_element_p,
> -                                                group_size))
> -           return false;
> -       }
> -
> -      if (single_element_p)
> -       {
> -         /* Single-element interleaving requires peeling for gaps.  */
> -         gcc_assert (GROUP_GAP (stmt_info));
> -       }
> -
> -      /* If there is a gap in the end of the group then we access excess
> -        elements in the last iteration and thus need to peel that off.  */
> -      if (loop_vinfo
> -         && ! STMT_VINFO_STRIDED_P (stmt_info)
> -         && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0)
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                            "Data access with gaps requires scalar "
> -                            "epilogue loop\n");
> -         if (loop->inner)
> -           {
> -             if (dump_enabled_p ())
> -               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                                "Peeling for outer loop is not supported\n");
> -             return false;
> -           }
> -
> -         LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
> -       }
>
>        if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>         slp_perm = true;
> @@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>      }
>
> +  vect_memory_access_type memory_access_type;
> +  if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD,
> +                           &memory_access_type, &gs_info))
> +    return false;
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> -    {
> -      gimple *def_stmt;
> -      if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info))
> -       gcc_unreachable ();
> -      if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt,
> -                              &gs_info.offset_dt, &gs_info.offset_vectype))
> -       {
> -         if (dump_enabled_p ())
> -           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                             "gather index use not simple.\n");
> -         return false;
> -       }
> -    }
> -  else if (STMT_VINFO_STRIDED_P (stmt_info))
> -    ;
> -  else
> +  if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)
> +      && !STMT_VINFO_STRIDED_P (stmt_info))
>      {
>        negative = tree_int_cst_compare (nested_in_vect_loop
>                                        ? STMT_VINFO_DR_STEP (stmt_info)
> @@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
>    if (!vec_stmt) /* transformation not required.  */
>      {
> +      if (!slp)
> +       STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type;
>        STMT_VINFO_TYPE (stmt_info) = load_vec_info_type;
>        /* The SLP costs are calculated during SLP analysis.  */
>        if (!PURE_SLP_STMT (stmt_info))
> -       vect_model_load_cost (stmt_info, ncopies, load_lanes_p,
> +       vect_model_load_cost (stmt_info, ncopies, memory_access_type,
>                               NULL, NULL, NULL);
>        return true;
>      }
>
> +  if (!slp)
> +    gcc_assert (memory_access_type
> +               == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info));
> +
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "transform load. ncopies = %d\n", ncopies);
> @@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>
>    ensure_base_align (stmt_info, dr);
>
> -  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
> +  if (memory_access_type == VMAT_GATHER_SCATTER)
>      {
>        tree vec_oprnd0 = NULL_TREE, op;
>        tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl));
> @@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>         }
>        return true;
>      }
> -  else if (STMT_VINFO_STRIDED_P (stmt_info))
> +
> +  if (memory_access_type == VMAT_ELEMENTWISE
> +      || memory_access_type == VMAT_STRIDED_SLP)
>      {
>        gimple_stmt_iterator incr_gsi;
>        bool insert_after;
> @@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        int lnel = 1;
>        tree ltype = TREE_TYPE (vectype);
>        auto_vec<tree> dr_chain;
> -      if (slp)
> +      if (memory_access_type == VMAT_STRIDED_SLP)
>         {
> -         if (group_size < nunits
> -             && nunits % group_size == 0)
> +         nloads = nunits / group_size;
> +         if (group_size < nunits)
>             {
> -             nloads = nunits / group_size;
>               lnel = group_size;
>               ltype = build_vector_type (TREE_TYPE (vectype), group_size);
> -             ltype = build_aligned_type (ltype,
> -                                         TYPE_ALIGN (TREE_TYPE (vectype)));
>             }
> -         else if (group_size >= nunits
> -                  && group_size % nunits == 0)
> +         else
>             {
> -             nloads = 1;
>               lnel = nunits;
>               ltype = vectype;
> -             ltype = build_aligned_type (ltype,
> -                                         TYPE_ALIGN (TREE_TYPE (vectype)));
>             }
> +         ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
> +       }
> +      if (slp)
> +       {
>           /* For SLP permutation support we need to load the whole group,
>              not only the number of vector stmts the permutation result
>              fits in.  */
> @@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    gcc_assert (alignment_support_scheme);
>    /* Targets with load-lane instructions must not require explicit
>       realignment.  */
> -  gcc_assert (!load_lanes_p
> +  gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES
>               || alignment_support_scheme == dr_aligned
>               || alignment_support_scheme == dr_unaligned_supported);
>
> @@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>    if (negative)
>      offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1);
>
> -  if (load_lanes_p)
> +  if (memory_access_type == VMAT_LOAD_STORE_LANES)
>      aggr_type = build_array_type_nelts (elem_type, vec_num * nunits);
>    else
>      aggr_type = vectype;
> @@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>        if (grouped_load || slp_perm)
>         dr_chain.create (vec_num);
>
> -      if (load_lanes_p)
> +      if (memory_access_type == VMAT_LOAD_STORE_LANES)
>         {
>           tree vec_array;
>
> @@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
>          {
>            if (grouped_load)
>             {
> -             if (!load_lanes_p)
> +             if (memory_access_type != VMAT_LOAD_STORE_LANES)
>                 vect_transform_grouped_load (stmt, dr_chain, group_size, gsi);
>               *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
>             }
