On Wed, Jun 15, 2016 at 10:52 AM, Richard Sandiford <richard.sandif...@arm.com> wrote:
> This is the main patch in the series. It adds a new enum and routines
> for classifying a vector load or store implementation.
>
> Tested on aarch64-linux-gnu and x86_64-linux-gnu. OK to install?
Why's the setting and checking of the memory access type conditional
on !slp?  I'd rather avoid doing this :/

Otherwise it looks like a step in the right direction of splitting the
vectorizable_* functions into an analysis part that records all
decisions made and a transform part that just applies them.

Thanks,
Richard.

> Thanks,
> Richard
>
>
> gcc/
> 	* tree-vectorizer.h (vect_memory_access_type): New enum.
> 	(_stmt_vec_info): Add a memory_access_type field.
> 	(STMT_VINFO_MEMORY_ACCESS_TYPE): New macro.
> 	(vect_model_store_cost): Take an access type instead of a boolean.
> 	(vect_model_load_cost): Likewise.
> 	* tree-vect-slp.c (vect_analyze_slp_cost_1): Update calls to
> 	vect_model_store_cost and vect_model_load_cost.
> 	* tree-vect-stmts.c (vec_load_store_type): New enum.
> 	(vect_model_store_cost): Take an access type instead of a
> 	store_lanes_p boolean.  Simplify tests.
> 	(vect_model_load_cost): Likewise, but for load_lanes_p.
> 	(get_group_load_store_type, get_load_store_type): New functions.
> 	(vectorizable_store): Use get_load_store_type.  Record the access
> 	type in STMT_VINFO_MEMORY_ACCESS_TYPE.
> 	(vectorizable_load): Likewise.
> 	(vectorizable_mask_load_store): Likewise.  Replace is_store
> 	variable with vls_type.
>
> Index: gcc/tree-vectorizer.h
> ===================================================================
> --- gcc/tree-vectorizer.h
> +++ gcc/tree-vectorizer.h
> @@ -485,6 +485,33 @@ enum slp_vect_type {
>    hybrid
>  };
>
> +/* Describes how we're going to vectorize an individual load or store,
> +   or a group of loads or stores.  */
> +enum vect_memory_access_type {
> +  /* A simple contiguous access.  */
> +  VMAT_CONTIGUOUS,
> +
> +  /* A simple contiguous access in which the elements need to be permuted
> +     after loading or before storing.  Only used for loop vectorization;
> +     SLP uses separate permutes.  */
> +  VMAT_CONTIGUOUS_PERMUTE,
> +
> +  /* An access that uses IFN_LOAD_LANES or IFN_STORE_LANES.  */
> +  VMAT_LOAD_STORE_LANES,
> +
> +  /* An access in which each scalar element is loaded or stored
> +     individually.  */
> +  VMAT_ELEMENTWISE,
> +
> +  /* A hybrid of VMAT_CONTIGUOUS and VMAT_ELEMENTWISE, used for grouped
> +     SLP accesses.  Each unrolled iteration uses a contiguous load
> +     or store for the whole group, but the groups from separate iterations
> +     are combined in the same way as for VMAT_ELEMENTWISE.  */
> +  VMAT_STRIDED_SLP,
> +
> +  /* The access uses gather loads or scatter stores.  */
> +  VMAT_GATHER_SCATTER
> +};
>
>  typedef struct data_reference *dr_p;
>
> @@ -602,6 +629,10 @@ typedef struct _stmt_vec_info {
>    /* True if this is an access with loop-invariant stride.  */
>    bool strided_p;
>
> +  /* Classifies how the load or store is going to be implemented
> +     for loop vectorization.  */
> +  vect_memory_access_type memory_access_type;
> +
>    /* For both loads and stores.
*/ > bool simd_lane_access_p; > > @@ -659,6 +690,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) > #define STMT_VINFO_DATA_REF(S) (S)->data_ref_info > #define STMT_VINFO_GATHER_SCATTER_P(S) (S)->gather_scatter_p > #define STMT_VINFO_STRIDED_P(S) (S)->strided_p > +#define STMT_VINFO_MEMORY_ACCESS_TYPE(S) (S)->memory_access_type > #define STMT_VINFO_SIMD_LANE_ACCESS_P(S) (S)->simd_lane_access_p > #define STMT_VINFO_VEC_REDUCTION_TYPE(S) (S)->v_reduc_type > > @@ -1006,12 +1038,12 @@ extern void free_stmt_vec_info (gimple *stmt); > extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *, > stmt_vector_for_cost *, > stmt_vector_for_cost *); > -extern void vect_model_store_cost (stmt_vec_info, int, bool, > +extern void vect_model_store_cost (stmt_vec_info, int, > vect_memory_access_type, > enum vect_def_type, slp_tree, > stmt_vector_for_cost *, > stmt_vector_for_cost *); > -extern void vect_model_load_cost (stmt_vec_info, int, bool, slp_tree, > - stmt_vector_for_cost *, > +extern void vect_model_load_cost (stmt_vec_info, int, > vect_memory_access_type, > + slp_tree, stmt_vector_for_cost *, > stmt_vector_for_cost *); > extern unsigned record_stmt_cost (stmt_vector_for_cost *, int, > enum vect_cost_for_stmt, stmt_vec_info, > Index: gcc/tree-vect-slp.c > =================================================================== > --- gcc/tree-vect-slp.c > +++ gcc/tree-vect-slp.c > @@ -1490,9 +1490,13 @@ vect_analyze_slp_cost_1 (slp_instance instance, > slp_tree node, > stmt_info = vinfo_for_stmt (stmt); > if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > { > + vect_memory_access_type memory_access_type > + = (STMT_VINFO_STRIDED_P (stmt_info) > + ? VMAT_STRIDED_SLP > + : VMAT_CONTIGUOUS); > if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info))) > - vect_model_store_cost (stmt_info, ncopies_for_cost, false, > - vect_uninitialized_def, > + vect_model_store_cost (stmt_info, ncopies_for_cost, > + memory_access_type, vect_uninitialized_def, > node, prologue_cost_vec, body_cost_vec); > else > { > @@ -1515,8 +1519,9 @@ vect_analyze_slp_cost_1 (slp_instance instance, > slp_tree node, > ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance); > } > /* Record the cost for the vector loads. */ > - vect_model_load_cost (stmt_info, ncopies_for_cost, false, > - node, prologue_cost_vec, body_cost_vec); > + vect_model_load_cost (stmt_info, ncopies_for_cost, > + memory_access_type, node, prologue_cost_vec, > + body_cost_vec); > return; > } > } > Index: gcc/tree-vect-stmts.c > =================================================================== > --- gcc/tree-vect-stmts.c > +++ gcc/tree-vect-stmts.c > @@ -52,6 +52,14 @@ along with GCC; see the file COPYING3. If not see > /* For lang_hooks.types.type_for_mode. */ > #include "langhooks.h" > > +/* Says whether a statement is a load, a store of a vectorized statement > + result, or a store of an invariant value. */ > +enum vec_load_store_type { > + VLS_LOAD, > + VLS_STORE, > + VLS_STORE_INVARIANT > +}; > + > /* Return the vectorized type for the given statement. 
*/ > > tree > @@ -873,8 +881,8 @@ vect_model_promotion_demotion_cost (stmt_vec_info > stmt_info, > > void > vect_model_store_cost (stmt_vec_info stmt_info, int ncopies, > - bool store_lanes_p, enum vect_def_type dt, > - slp_tree slp_node, > + vect_memory_access_type memory_access_type, > + enum vect_def_type dt, slp_tree slp_node, > stmt_vector_for_cost *prologue_cost_vec, > stmt_vector_for_cost *body_cost_vec) > { > @@ -903,14 +911,9 @@ vect_model_store_cost (stmt_vec_info stmt_info, int > ncopies, > /* We assume that the cost of a single store-lanes instruction is > equivalent to the cost of GROUP_SIZE separate stores. If a grouped > access is instead being provided by a permute-and-store operation, > - include the cost of the permutes. > - > - For SLP, the caller has already counted the permutation, if any. */ > - if (grouped_access_p > - && first_stmt_p > - && !store_lanes_p > - && !STMT_VINFO_STRIDED_P (stmt_info) > - && !slp_node) > + include the cost of the permutes. */ > + if (first_stmt_p > + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE) > { > /* Uses a high and low interleave or shuffle operations for each > needed permute. */ > @@ -927,17 +930,16 @@ vect_model_store_cost (stmt_vec_info stmt_info, int > ncopies, > > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > /* Costs of the stores. */ > - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p)) > - { > - /* N scalar stores plus extracting the elements. */ > - inside_cost += record_stmt_cost (body_cost_vec, > - ncopies * TYPE_VECTOR_SUBPARTS > (vectype), > - scalar_store, stmt_info, 0, vect_body); > - } > + if (memory_access_type == VMAT_ELEMENTWISE) > + /* N scalar stores plus extracting the elements. */ > + inside_cost += record_stmt_cost (body_cost_vec, > + ncopies * TYPE_VECTOR_SUBPARTS (vectype), > + scalar_store, stmt_info, 0, vect_body); > else > vect_get_store_cost (dr, ncopies, &inside_cost, body_cost_vec); > > - if (STMT_VINFO_STRIDED_P (stmt_info)) > + if (memory_access_type == VMAT_ELEMENTWISE > + || memory_access_type == VMAT_STRIDED_SLP) > inside_cost += record_stmt_cost (body_cost_vec, > ncopies * TYPE_VECTOR_SUBPARTS (vectype), > vec_to_scalar, stmt_info, 0, vect_body); > @@ -1011,7 +1013,8 @@ vect_get_store_cost (struct data_reference *dr, int > ncopies, > > void > vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, > - bool load_lanes_p, slp_tree slp_node, > + vect_memory_access_type memory_access_type, > + slp_tree slp_node, > stmt_vector_for_cost *prologue_cost_vec, > stmt_vector_for_cost *body_cost_vec) > { > @@ -1036,14 +1039,9 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > ncopies, > /* We assume that the cost of a single load-lanes instruction is > equivalent to the cost of GROUP_SIZE separate loads. If a grouped > access is instead being provided by a load-and-permute operation, > - include the cost of the permutes. > - > - For SLP, the caller has already counted the permutation, if any. */ > - if (grouped_access_p > - && first_stmt_p > - && !load_lanes_p > - && !STMT_VINFO_STRIDED_P (stmt_info) > - && !slp_node) > + include the cost of the permutes. */ > + if (first_stmt_p > + && memory_access_type == VMAT_CONTIGUOUS_PERMUTE) > { > /* Uses an even and odd extract operations or shuffle operations > for each needed permute. */ > @@ -1059,7 +1057,7 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > ncopies, > } > > /* The loads themselves. 
*/ > - if (STMT_VINFO_STRIDED_P (stmt_info) && !(slp_node && grouped_access_p)) > + if (memory_access_type == VMAT_ELEMENTWISE) > { > /* N scalar loads plus gathering them into a vector. */ > tree vectype = STMT_VINFO_VECTYPE (stmt_info); > @@ -1071,7 +1069,8 @@ vect_model_load_cost (stmt_vec_info stmt_info, int > ncopies, > vect_get_load_cost (dr, ncopies, first_stmt_p, > &inside_cost, &prologue_cost, > prologue_cost_vec, body_cost_vec, true); > - if (STMT_VINFO_STRIDED_P (stmt_info)) > + if (memory_access_type == VMAT_ELEMENTWISE > + || memory_access_type == VMAT_STRIDED_SLP) > inside_cost += record_stmt_cost (body_cost_vec, ncopies, vec_construct, > stmt_info, 0, vect_body); > > @@ -1674,6 +1673,209 @@ static tree permute_vec_elements (tree, tree, tree, > gimple *, > gimple_stmt_iterator *); > > > +/* A subroutine of get_load_store_type, with a subset of the same > + arguments. Handle the case where STMT is part of a grouped load > + or store. > + > + For stores, the statements in the group are all consecutive > + and there is no gap at the end. For loads, the statements in the > + group might not be consecutive; there can be gaps between statements > + as well as at the end. */ > + > +static bool > +get_group_load_store_type (gimple *stmt, tree vectype, bool slp, > + vec_load_store_type vls_type, > + vect_memory_access_type *memory_access_type) > +{ > + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + vec_info *vinfo = stmt_info->vinfo; > + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); > + struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL; > + gimple *first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > + unsigned int group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); > + bool single_element_p = (stmt == first_stmt > + && !GROUP_NEXT_ELEMENT (stmt_info)); > + unsigned HOST_WIDE_INT gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); > + int nunits = TYPE_VECTOR_SUBPARTS (vectype); > + > + /* True if the vectorized statements would access beyond the last > + statement in the group. */ > + bool overrun_p = false; > + > + /* True if we can cope with such overrun by peeling for gaps, so that > + there is at least one final scalar iteration after the vector loop. */ > + bool can_overrun_p = (vls_type == VLS_LOAD && loop_vinfo && !loop->inner); > + > + /* There can only be a gap at the end of the group if the stride is > + known at compile time. */ > + gcc_assert (!STMT_VINFO_STRIDED_P (stmt_info) || gap == 0); > + > + /* Stores can't yet have gaps. */ > + gcc_assert (slp || vls_type == VLS_LOAD || gap == 0); > + > + if (slp) > + { > + if (STMT_VINFO_STRIDED_P (stmt_info)) > + { > + /* Try to use consecutive accesses of GROUP_SIZE elements, > + separated by the stride, until we have a complete vector. > + Fall back to scalar accesses if that isn't possible. 
*/ > + if (nunits % group_size == 0) > + *memory_access_type = VMAT_STRIDED_SLP; > + else > + *memory_access_type = VMAT_ELEMENTWISE; > + } > + else > + { > + overrun_p = loop_vinfo && gap != 0; > + if (overrun_p && vls_type != VLS_LOAD) > + { > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Grouped store with gaps requires" > + " non-consecutive accesses\n"); > + return false; > + } > + if (overrun_p && !can_overrun_p) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Peeling for outer loop is not supported\n"); > + return false; > + } > + *memory_access_type = VMAT_CONTIGUOUS; > + } > + } > + else > + { > + /* We can always handle this case using elementwise accesses, > + but see if something more efficient is available. */ > + *memory_access_type = VMAT_ELEMENTWISE; > + > + /* If there is a gap at the end of the group then these optimizations > + would access excess elements in the last iteration. */ > + bool would_overrun_p = (gap != 0); > + if (!STMT_VINFO_STRIDED_P (stmt_info) > + && (can_overrun_p || !would_overrun_p)) > + { > + /* First try using LOAD/STORE_LANES. */ > + if (vls_type == VLS_LOAD > + ? vect_load_lanes_supported (vectype, group_size) > + : vect_store_lanes_supported (vectype, group_size)) > + { > + *memory_access_type = VMAT_LOAD_STORE_LANES; > + overrun_p = would_overrun_p; > + } > + > + /* If that fails, try using permuting loads. */ > + if (*memory_access_type == VMAT_ELEMENTWISE > + && (vls_type == VLS_LOAD > + ? vect_grouped_load_supported (vectype, single_element_p, > + group_size) > + : vect_grouped_store_supported (vectype, group_size))) > + { > + *memory_access_type = VMAT_CONTIGUOUS_PERMUTE; > + overrun_p = would_overrun_p; > + } > + } > + } > + > + if (vls_type != VLS_LOAD && first_stmt == stmt) > + { > + /* STMT is the leader of the group. Check the operands of all the > + stmts of the group. */ > + gimple *next_stmt = GROUP_NEXT_ELEMENT (stmt_info); > + while (next_stmt) > + { > + gcc_assert (gimple_assign_single_p (next_stmt)); > + tree op = gimple_assign_rhs1 (next_stmt); > + gimple *def_stmt; > + enum vect_def_type dt; > + if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "use not simple.\n"); > + return false; > + } > + next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); > + } > + } > + > + if (overrun_p) > + { > + gcc_assert (can_overrun_p); > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "Data access with gaps requires scalar " > + "epilogue loop\n"); > + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; > + } > + > + return true; > +} > + > +/* Analyze load or store statement STMT of type VLS_TYPE. Return true > + if there is a memory access type that the vectorized form can use, > + storing it in *MEMORY_ACCESS_TYPE if so. If we decide to use gathers > + or scatters, fill in GS_INFO accordingly. > + > + SLP says whether we're performing SLP rather than loop vectorization. > + VECTYPE is the vector type that the vectorized statements will use. 
*/ > + > +static bool > +get_load_store_type (gimple *stmt, tree vectype, bool slp, > + vec_load_store_type vls_type, > + vect_memory_access_type *memory_access_type, > + gather_scatter_info *gs_info) > +{ > + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); > + vec_info *vinfo = stmt_info->vinfo; > + loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); > + if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + { > + *memory_access_type = VMAT_GATHER_SCATTER; > + gimple *def_stmt; > + if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)) > + gcc_unreachable (); > + else if (!vect_is_simple_use (gs_info->offset, vinfo, &def_stmt, > + &gs_info->offset_dt, > + &gs_info->offset_vectype)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "%s index use not simple.\n", > + vls_type == VLS_LOAD ? "gather" : "scatter"); > + return false; > + } > + } > + else if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > + { > + if (!get_group_load_store_type (stmt, vectype, slp, vls_type, > + memory_access_type)) > + return false; > + } > + else if (STMT_VINFO_STRIDED_P (stmt_info)) > + { > + gcc_assert (!slp); > + *memory_access_type = VMAT_ELEMENTWISE; > + } > + else > + *memory_access_type = VMAT_CONTIGUOUS; > + > + /* FIXME: At the moment the cost model seems to underestimate the > + cost of using elementwise accesses. This check preserves the > + traditional behavior until that can be fixed. */ > + if (*memory_access_type == VMAT_ELEMENTWISE > + && !STMT_VINFO_STRIDED_P (stmt_info)) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "not falling back to elementwise accesses\n"); > + return false; > + } > + return true; > +} > + > /* Function vectorizable_mask_load_store. > > Check if STMT performs a conditional load or store that can be vectorized. 
> @@ -1705,7 +1907,7 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > int i, j; > bool inv_p; > gather_scatter_info gs_info; > - bool is_store; > + vec_load_store_type vls_type; > tree mask; > gimple *def_stmt; > enum vect_def_type dt; > @@ -1716,7 +1918,6 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; > gcc_assert (ncopies >= 1); > > - is_store = gimple_call_internal_fn (stmt) == IFN_MASK_STORE; > mask = gimple_call_arg (stmt, 2); > > if (TREE_CODE (TREE_TYPE (mask)) != BOOLEAN_TYPE) > @@ -1743,12 +1944,6 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > > elem_type = TREE_TYPE (vectype); > > - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > - return false; > - > - if (STMT_VINFO_STRIDED_P (stmt_info)) > - return false; > - > if (TREE_CODE (mask) != SSA_NAME) > return false; > > @@ -1762,27 +1957,26 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > || TYPE_VECTOR_SUBPARTS (mask_vectype) != TYPE_VECTOR_SUBPARTS > (vectype)) > return false; > > - if (is_store) > + if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE) > { > tree rhs = gimple_call_arg (stmt, 3); > if (!vect_is_simple_use (rhs, loop_vinfo, &def_stmt, &dt, > &rhs_vectype)) > return false; > + if (dt == vect_constant_def || dt == vect_external_def) > + vls_type = VLS_STORE_INVARIANT; > + else > + vls_type = VLS_STORE; > } > + else > + vls_type = VLS_LOAD; > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > - { > - gimple *def_stmt; > - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) > - gcc_unreachable (); > - if (!vect_is_simple_use (gs_info.offset, loop_vinfo, &def_stmt, > - &gs_info.offset_dt, &gs_info.offset_vectype)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "gather index use not simple."); > - return false; > - } > + vect_memory_access_type memory_access_type; > + if (!get_load_store_type (stmt, vectype, false, vls_type, > + &memory_access_type, &gs_info)) > + return false; > > + if (memory_access_type == VMAT_GATHER_SCATTER) > + { > tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > tree masktype > = TREE_VALUE (TREE_CHAIN (TREE_CHAIN (TREE_CHAIN (arglist)))); > @@ -1794,6 +1988,14 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > return false; > } > } > + else if (memory_access_type != VMAT_CONTIGUOUS) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "unsupported access type for masked %s\n", > + vls_type == VLS_LOAD ? "load" : "store"); > + return false; > + } > else if (tree_int_cst_compare (nested_in_vect_loop > ? STMT_VINFO_DR_STEP (stmt_info) > : DR_STEP (dr), size_zero_node) <= 0) > @@ -1801,25 +2003,28 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > else if (!VECTOR_MODE_P (TYPE_MODE (vectype)) > || !can_vec_mask_load_store_p (TYPE_MODE (vectype), > TYPE_MODE (mask_vectype), > - !is_store) > + vls_type == VLS_LOAD) > || (rhs_vectype > && !useless_type_conversion_p (vectype, rhs_vectype))) > return false; > > if (!vec_stmt) /* transformation not required. 
*/ > { > + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; > STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; > - if (is_store) > - vect_model_store_cost (stmt_info, ncopies, false, dt, > - NULL, NULL, NULL); > + if (vls_type == VLS_LOAD) > + vect_model_load_cost (stmt_info, ncopies, memory_access_type, > + NULL, NULL, NULL); > else > - vect_model_load_cost (stmt_info, ncopies, false, NULL, NULL, NULL); > + vect_model_store_cost (stmt_info, ncopies, memory_access_type, > + dt, NULL, NULL, NULL); > return true; > } > + gcc_assert (memory_access_type == STMT_VINFO_MEMORY_ACCESS_TYPE > (stmt_info)); > > /** Transform. **/ > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + if (memory_access_type == VMAT_GATHER_SCATTER) > { > tree vec_oprnd0 = NULL_TREE, op; > tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > @@ -1993,7 +2198,7 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > gsi_replace (gsi, new_stmt, true); > return true; > } > - else if (is_store) > + else if (vls_type != VLS_LOAD) > { > tree vec_rhs = NULL_TREE, vec_mask = NULL_TREE; > prev_stmt_info = NULL; > @@ -2102,7 +2307,7 @@ vectorizable_mask_load_store (gimple *stmt, > gimple_stmt_iterator *gsi, > } > } > > - if (!is_store) > + if (vls_type == VLS_LOAD) > { > /* Ensure that even with -fno-tree-dce the scalar MASK_LOAD is removed > from the IL. */ > @@ -5188,9 +5393,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > gimple *ptr_incr = NULL; > int ncopies; > int j; > - gimple *next_stmt, *first_stmt = NULL; > - bool grouped_store = false; > - bool store_lanes_p = false; > + gimple *next_stmt, *first_stmt; > + bool grouped_store; > unsigned int group_size, i; > vec<tree> dr_chain = vNULL; > vec<tree> oprnds = vNULL; > @@ -5207,6 +5411,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > gather_scatter_info gs_info; > enum vect_def_type scatter_src_dt = vect_unknown_def_type; > gimple *new_stmt; > + vec_load_store_type vls_type; > > if (!STMT_VINFO_RELEVANT_P (stmt_info) && !bb_vinfo) > return false; > @@ -5274,6 +5479,11 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > return false; > } > > + if (dt == vect_constant_def || dt == vect_external_def) > + vls_type = VLS_STORE_INVARIANT; > + else > + vls_type = VLS_STORE; > + > if (rhs_vectype && !useless_type_conversion_p (vectype, rhs_vectype)) > return false; > > @@ -5303,7 +5513,6 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > } > if (negative) > { > - gcc_assert (!grouped_store); > alignment_support_scheme = vect_supportable_dr_alignment (dr, > false); > if (alignment_support_scheme != dr_aligned > && alignment_support_scheme != dr_unaligned_supported) > @@ -5325,80 +5534,31 @@ vectorizable_store (gimple *stmt, > gimple_stmt_iterator *gsi, gimple **vec_stmt, > } > } > > - if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > - { > - grouped_store = true; > - first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > - group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); > - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info)) > - { > - if (vect_store_lanes_supported (vectype, group_size)) > - store_lanes_p = true; > - else if (!vect_grouped_store_supported (vectype, group_size)) > - return false; > - } > - > - if (STMT_VINFO_STRIDED_P (stmt_info) > - && slp > - && (group_size > nunits > - || nunits % group_size != 0)) > - { > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "unhandled 
strided group store\n"); > - return false; > - } > - > - if (first_stmt == stmt) > - { > - /* STMT is the leader of the group. Check the operands of all the > - stmts of the group. */ > - next_stmt = GROUP_NEXT_ELEMENT (stmt_info); > - while (next_stmt) > - { > - gcc_assert (gimple_assign_single_p (next_stmt)); > - op = gimple_assign_rhs1 (next_stmt); > - if (!vect_is_simple_use (op, vinfo, &def_stmt, &dt)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "use not simple.\n"); > - return false; > - } > - next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt)); > - } > - } > - } > - > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > - { > - gimple *def_stmt; > - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) > - gcc_unreachable (); > - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt, > - &gs_info.offset_dt, &gs_info.offset_vectype)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "scatter index use not simple."); > - return false; > - } > - } > + vect_memory_access_type memory_access_type; > + if (!get_load_store_type (stmt, vectype, slp, vls_type, > + &memory_access_type, &gs_info)) > + return false; > > if (!vec_stmt) /* transformation not required. */ > { > + if (!slp) > + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; > STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; > /* The SLP costs are calculated during SLP analysis. */ > if (!PURE_SLP_STMT (stmt_info)) > - vect_model_store_cost (stmt_info, ncopies, store_lanes_p, dt, > + vect_model_store_cost (stmt_info, ncopies, memory_access_type, dt, > NULL, NULL, NULL); > return true; > } > + if (!slp) > + gcc_assert (memory_access_type > + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); > > /** Transform. **/ > > ensure_base_align (stmt_info, dr); > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + if (memory_access_type == VMAT_GATHER_SCATTER) > { > tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE, op, src; > tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > @@ -5538,8 +5698,10 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > return true; > } > > + grouped_store = STMT_VINFO_GROUPED_ACCESS (stmt_info); > if (grouped_store) > { > + first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); > group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); > > @@ -5585,7 +5747,8 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > dump_printf_loc (MSG_NOTE, vect_location, > "transform store. ncopies = %d\n", ncopies); > > - if (STMT_VINFO_STRIDED_P (stmt_info)) > + if (memory_access_type == VMAT_ELEMENTWISE > + || memory_access_type == VMAT_STRIDED_SLP) > { > gimple_stmt_iterator incr_gsi; > bool insert_after; > @@ -5756,14 +5919,14 @@ vectorizable_store (gimple *stmt, > gimple_stmt_iterator *gsi, gimple **vec_stmt, > gcc_assert (alignment_support_scheme); > /* Targets with store-lane instructions must not require explicit > realignment. 
*/ > - gcc_assert (!store_lanes_p > + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES > || alignment_support_scheme == dr_aligned > || alignment_support_scheme == dr_unaligned_supported); > > if (negative) > offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); > > - if (store_lanes_p) > + if (memory_access_type == VMAT_LOAD_STORE_LANES) > aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); > else > aggr_type = vectype; > @@ -5901,7 +6064,7 @@ vectorizable_store (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > TYPE_SIZE_UNIT (aggr_type)); > } > > - if (store_lanes_p) > + if (memory_access_type == VMAT_LOAD_STORE_LANES) > { > tree vec_array; > > @@ -6185,7 +6348,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > gphi *phi = NULL; > vec<tree> dr_chain = vNULL; > bool grouped_load = false; > - bool load_lanes_p = false; > gimple *first_stmt; > gimple *first_stmt_for_drptr = NULL; > bool inv_p; > @@ -6294,48 +6456,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > { > grouped_load = true; > /* FORNOW */ > - gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P > (stmt_info)); > + gcc_assert (!nested_in_vect_loop); > + gcc_assert (!STMT_VINFO_GATHER_SCATTER_P (stmt_info)); > > first_stmt = GROUP_FIRST_ELEMENT (stmt_info); > group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt)); > - bool single_element_p = (first_stmt == stmt > - && !GROUP_NEXT_ELEMENT (stmt_info)); > - > - if (!slp && !STMT_VINFO_STRIDED_P (stmt_info)) > - { > - if (vect_load_lanes_supported (vectype, group_size)) > - load_lanes_p = true; > - else if (!vect_grouped_load_supported (vectype, single_element_p, > - group_size)) > - return false; > - } > - > - if (single_element_p) > - { > - /* Single-element interleaving requires peeling for gaps. */ > - gcc_assert (GROUP_GAP (stmt_info)); > - } > - > - /* If there is a gap in the end of the group then we access excess > - elements in the last iteration and thus need to peel that off. */ > - if (loop_vinfo > - && ! 
STMT_VINFO_STRIDED_P (stmt_info) > - && GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "Data access with gaps requires scalar " > - "epilogue loop\n"); > - if (loop->inner) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "Peeling for outer loop is not supported\n"); > - return false; > - } > - > - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; > - } > > if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) > slp_perm = true; > @@ -6381,24 +6506,13 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > } > } > > + vect_memory_access_type memory_access_type; > + if (!get_load_store_type (stmt, vectype, slp, VLS_LOAD, > + &memory_access_type, &gs_info)) > + return false; > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > - { > - gimple *def_stmt; > - if (!vect_check_gather_scatter (stmt, loop_vinfo, &gs_info)) > - gcc_unreachable (); > - if (!vect_is_simple_use (gs_info.offset, vinfo, &def_stmt, > - &gs_info.offset_dt, &gs_info.offset_vectype)) > - { > - if (dump_enabled_p ()) > - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > - "gather index use not simple.\n"); > - return false; > - } > - } > - else if (STMT_VINFO_STRIDED_P (stmt_info)) > - ; > - else > + if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info) > + && !STMT_VINFO_STRIDED_P (stmt_info)) > { > negative = tree_int_cst_compare (nested_in_vect_loop > ? STMT_VINFO_DR_STEP (stmt_info) > @@ -6444,14 +6558,20 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > > if (!vec_stmt) /* transformation not required. */ > { > + if (!slp) > + STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) = memory_access_type; > STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; > /* The SLP costs are calculated during SLP analysis. */ > if (!PURE_SLP_STMT (stmt_info)) > - vect_model_load_cost (stmt_info, ncopies, load_lanes_p, > + vect_model_load_cost (stmt_info, ncopies, memory_access_type, > NULL, NULL, NULL); > return true; > } > > + if (!slp) > + gcc_assert (memory_access_type > + == STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info)); > + > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > "transform load. 
ncopies = %d\n", ncopies); > @@ -6460,7 +6580,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > > ensure_base_align (stmt_info, dr); > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)) > + if (memory_access_type == VMAT_GATHER_SCATTER) > { > tree vec_oprnd0 = NULL_TREE, op; > tree arglist = TYPE_ARG_TYPES (TREE_TYPE (gs_info.decl)); > @@ -6627,7 +6747,9 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > } > return true; > } > - else if (STMT_VINFO_STRIDED_P (stmt_info)) > + > + if (memory_access_type == VMAT_ELEMENTWISE > + || memory_access_type == VMAT_STRIDED_SLP) > { > gimple_stmt_iterator incr_gsi; > bool insert_after; > @@ -6694,26 +6816,23 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > int lnel = 1; > tree ltype = TREE_TYPE (vectype); > auto_vec<tree> dr_chain; > - if (slp) > + if (memory_access_type == VMAT_STRIDED_SLP) > { > - if (group_size < nunits > - && nunits % group_size == 0) > + nloads = nunits / group_size; > + if (group_size < nunits) > { > - nloads = nunits / group_size; > lnel = group_size; > ltype = build_vector_type (TREE_TYPE (vectype), group_size); > - ltype = build_aligned_type (ltype, > - TYPE_ALIGN (TREE_TYPE (vectype))); > } > - else if (group_size >= nunits > - && group_size % nunits == 0) > + else > { > - nloads = 1; > lnel = nunits; > ltype = vectype; > - ltype = build_aligned_type (ltype, > - TYPE_ALIGN (TREE_TYPE (vectype))); > } > + ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE > (vectype))); > + } > + if (slp) > + { > /* For SLP permutation support we need to load the whole group, > not only the number of vector stmts the permutation result > fits in. */ > @@ -6845,7 +6964,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > gcc_assert (alignment_support_scheme); > /* Targets with load-lane instructions must not require explicit > realignment. */ > - gcc_assert (!load_lanes_p > + gcc_assert (memory_access_type != VMAT_LOAD_STORE_LANES > || alignment_support_scheme == dr_aligned > || alignment_support_scheme == dr_unaligned_supported); > > @@ -6980,7 +7099,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > if (negative) > offset = size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1); > > - if (load_lanes_p) > + if (memory_access_type == VMAT_LOAD_STORE_LANES) > aggr_type = build_array_type_nelts (elem_type, vec_num * nunits); > else > aggr_type = vectype; > @@ -7043,7 +7162,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > if (grouped_load || slp_perm) > dr_chain.create (vec_num); > > - if (load_lanes_p) > + if (memory_access_type == VMAT_LOAD_STORE_LANES) > { > tree vec_array; > > @@ -7313,7 +7432,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator > *gsi, gimple **vec_stmt, > { > if (grouped_load) > { > - if (!load_lanes_p) > + if (memory_access_type != VMAT_LOAD_STORE_LANES) > vect_transform_grouped_load (stmt, dr_chain, group_size, gsi); > *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); > }
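
For readers skimming the thread, the analysis/transform split mentioned in the reply refers to the shape the patch gives vectorizable_store and vectorizable_load: get_load_store_type chooses a vect_memory_access_type, the analysis call records it in STMT_VINFO_MEMORY_ACCESS_TYPE, and the transform call asserts that it sees the same classification again. Both the recording and the assertion are currently guarded by !slp, which is what the review questions. The stand-alone sketch below only models that shape; the types and functions in it (stmt_info_model, classify_access, vectorizable_model) are invented for illustration and are not GCC code.

/* Stand-alone illustration only; the types below are invented for the
   example.  It models how the patch splits the work: the analysis call
   (vec_stmt == NULL) records the chosen access type, the transform call
   checks and reuses it, and both steps are skipped for SLP because of
   the !slp guard discussed above.  */

#include <cassert>

enum vect_memory_access_type { VMAT_CONTIGUOUS, VMAT_ELEMENTWISE };

struct stmt_info_model
{
  bool strided_p;
  vect_memory_access_type memory_access_type;
};

/* Stand-in for get_load_store_type: classify the access.  */
static vect_memory_access_type
classify_access (const stmt_info_model &info)
{
  return info.strided_p ? VMAT_ELEMENTWISE : VMAT_CONTIGUOUS;
}

/* Stand-in for vectorizable_store/vectorizable_load, which are called
   once for analysis and once for transform.  */
static bool
vectorizable_model (stmt_info_model &info, void **vec_stmt, bool slp)
{
  vect_memory_access_type memory_access_type = classify_access (info);

  if (!vec_stmt)
    {
      /* Analysis: record the decision (currently only when !slp).  */
      if (!slp)
	info.memory_access_type = memory_access_type;
      return true;
    }

  /* Transform: the recorded decision must still hold (again only
     checked when !slp).  */
  if (!slp)
    assert (memory_access_type == info.memory_access_type);

  /* ... statement generation would branch on memory_access_type ...  */
  return true;
}

Dropping the !slp guard would mean SLP and loop vectorization both record the decision during analysis and re-check it during transform, which is the direction the reply asks for.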