Hi,
this is a refactor that moves the setting of slp_perm, the basic-block
SLP gap check, and the final perm_ok check from vectorizable_load into
get_load_store_type. slp_perm itself is moved into ls_data.
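For reference, a condensed sketch of the caller side in vectorizable_load
after the change (assembled from the hunks below, not additional code):
the permutation analysis now happens inside get_load_store_type, and the
transform/costing code only reads ls.slp_perm and ls.n_perms.

  vect_load_store_data _ls_data{};
  vect_load_store_data &ls = slp_node->get_data (_ls_data);
  if (cost_vec
      && !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
                               VLS_LOAD, &ls))
    return false;
  ...
  if (ls.slp_perm && costing_p)
    {
      /* The permutation count was already determined during analysis.  */
      gcc_assert (ls.n_perms != -1U);
      inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
                                       slp_node, 0, vect_body);
    }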
To help legibility, the patch introduces an enum
vect_gather_scatter_subtype that specifies the gather/scatter
"sub-type", e.g. strided or grouped strided. It is only used in one
place for now, though.
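The enum and the single condition that consults it so far look like this
(copied in condensed form from the hunks below):

  enum vect_gather_scatter_subtype {
    GATHER_SCATTER_UNDEFINED,
    GATHER_SCATTER_REGULAR,
    GATHER_SCATTER_STRIDED,
    GATHER_SCATTER_STRIDED_GROUPED
  };

  /* Only loads that are neither elementwise nor strided gathers
     (the single-lane fallback) need an explicit load permutation.  */
  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
      && !(*memory_access_type == VMAT_ELEMENTWISE
           || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
    ...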
Bootstrapped and regtested on x86 and power10. Regtested on aarch64 and
rv64gcv_zvl512b.
Regards
Robin
gcc/ChangeLog:
* tree-vect-stmts.cc (get_load_store_type): Remove perm_ok parameter.
Add load-permutation checks and set slp_perm.
(vectorizable_store): Don't pass perm_ok.
(vectorizable_load): Ditto and replace slp_perm with ls.slp_perm.
* tree-vectorizer.h (enum vect_gather_scatter_subtype): New.
(struct vect_load_store_data): Add slp_perm.
---
gcc/tree-vect-stmts.cc | 181 ++++++++++++++++++++++-------------------
gcc/tree-vectorizer.h | 9 ++
2 files changed, 105 insertions(+), 85 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index bf5a67bf805..ef33638af74 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2062,16 +2062,13 @@ vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype,
VECTYPE is the vector type that the vectorized statements will use.
If ELSVALS is nonzero the supported else values will be stored in the
- vector ELSVALS points to.
-
- For loads PERM_OK indicates whether we can code generate a
- SLP_TREE_LOAD_PERMUTATION on the node. */
+ vector ELSVALS points to. */
static bool
get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
tree vectype, slp_tree slp_node,
bool masked_p, vec_load_store_type vls_type,
- bool perm_ok, vect_load_store_data *ls)
+ vect_load_store_data *ls)
{
vect_memory_access_type *memory_access_type = &ls->memory_access_type;
poly_int64 *poffset = &ls->poffset;
@@ -2081,6 +2078,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
internal_fn *lanes_ifn = &ls->lanes_ifn;
vec<int> *elsvals = &ls->elsvals;
tree *ls_type = &ls->ls_type;
+ bool *slp_perm = &ls->slp_perm;
+ unsigned *n_perms = &ls->n_perms;
loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
class loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
@@ -2093,6 +2092,15 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
*misalignment = DR_MISALIGNMENT_UNKNOWN;
*poffset = 0;
*ls_type = NULL_TREE;
+ *slp_perm = false;
+ *n_perms = -1U;
+
+ bool perm_ok = true;
+ poly_int64 vf = loop_vinfo ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
+
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
+ perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
+ vf, true, n_perms);
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
{
@@ -2131,6 +2139,10 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
if (! SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
first_dr_info = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
+ /* Sub-type of gather/scatter. */
+ vect_gather_scatter_subtype gather_scatter_subtype
+ = GATHER_SCATTER_UNDEFINED;
+
if (STMT_VINFO_STRIDED_P (first_stmt_info))
/* Try to use consecutive accesses of as many elements as possible,
separated by the stride, until we have a complete vector.
@@ -2181,6 +2193,8 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
}
*memory_access_type = VMAT_GATHER_SCATTER_EMULATED;
}
+
+ gather_scatter_subtype = GATHER_SCATTER_REGULAR;
}
else
{
@@ -2418,6 +2432,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
ls->gs.ifn = gs_info.ifn;
ls->strided_offset_vectype = gs_info.offset_vectype;
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
+ gather_scatter_subtype = GATHER_SCATTER_STRIDED;
}
else if (SLP_TREE_LANES (slp_node) > 1
&& !masked_p
@@ -2431,6 +2446,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
SLP_TREE_GS_BASE (slp_node) = error_mark_node;
grouped_gather_fallback = *memory_access_type;
*memory_access_type = VMAT_GATHER_SCATTER_IFN;
+ gather_scatter_subtype = GATHER_SCATTER_STRIDED_GROUPED;
ls->gs.ifn = gs_info.ifn;
vectype = *ls_type;
ls->strided_offset_vectype = gs_info.offset_vectype;
@@ -2534,7 +2550,7 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
poly_uint64 read_amount
= vf * TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
- read_amount *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
+ read_amount *= group_size;
auto target_alignment
= DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
@@ -2627,6 +2643,58 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
if (!loop_vinfo && *memory_access_type == VMAT_ELEMENTWISE)
return false;
+ /* Some loads need to explicitly permute the loaded data if there
+ is a load permutation. Among those are:
+ - VMAT_ELEMENTWISE.
+ - VMAT_STRIDED_SLP.
+ - VMAT_GATHER_SCATTER:
+ - Strided gather (fallback for VMAT_STRIDED_SLP if #lanes == 1).
+ - Grouped strided gather (ditto but for #lanes > 1).
+
+ For VMAT_ELEMENTWISE we can fold the load permutation into the
+ individual indices we access directly, eliding the permutation.
+ Strided gather only allows load permutations for the
+ single-element case. */
+
+ if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && !(*memory_access_type == VMAT_ELEMENTWISE
+ || gather_scatter_subtype == GATHER_SCATTER_STRIDED))
+ {
+ if (!loop_vinfo)
+ {
+ /* In BB vectorization we may not actually use a loaded vector
+ accessing elements in excess of DR_GROUP_SIZE. */
+ stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
+ group_info = DR_GROUP_FIRST_ELEMENT (group_info);
+ unsigned HOST_WIDE_INT nunits;
+ unsigned j, k, maxk = 0;
+ FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
+ if (k > maxk)
+ maxk = k;
+ tree vectype = SLP_TREE_VECTYPE (slp_node);
+ if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
+ || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "BB vectorization with gaps at the end of "
+ "a load is not supported\n");
+ return false;
+ }
+ }
+
+ if (!perm_ok)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+ vect_location,
+ "unsupported load permutation\n");
+ return false;
+ }
+
+ *slp_perm = true;
+ }
+
return true;
}
@@ -8002,7 +8070,7 @@ vectorizable_store (vec_info *vinfo,
vect_load_store_data &ls = slp_node->get_data (_ls_data);
if (cost_vec
&& !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
- vls_type, false, &_ls_data))
+ vls_type, &_ls_data))
return false;
/* Temporary aliases to analysis data, should not be modified through
these. */
@@ -9446,7 +9514,6 @@ vectorizable_load (vec_info *vinfo,
bool compute_in_loop = false;
class loop *at_loop;
int vec_num;
- bool slp_perm = false;
bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo);
poly_uint64 vf;
tree aggr_type;
@@ -9584,17 +9651,11 @@ vectorizable_load (vec_info *vinfo,
else
group_size = 1;
- bool perm_ok = true;
- unsigned n_perms = -1U;
- if (cost_vec && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
- perm_ok = vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL, vf,
- true, &n_perms);
-
vect_load_store_data _ls_data{};
vect_load_store_data &ls = slp_node->get_data (_ls_data);
if (cost_vec
&& !get_load_store_type (vinfo, stmt_info, vectype, slp_node, mask_node,
- VLS_LOAD, perm_ok, &ls))
+ VLS_LOAD, &ls))
return false;
/* Temporary aliases to analysis data, should not be modified through
these. */
@@ -9615,56 +9676,6 @@ vectorizable_load (vec_info *vinfo,
bool type_mode_padding_p
= TYPE_PRECISION (scalar_type) < GET_MODE_PRECISION (GET_MODE_INNER (mode));
- /* ??? The following checks should really be part of
- get_load_store_type. */
- if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
- && !(memory_access_type == VMAT_ELEMENTWISE
- || (mat_gather_scatter_p (memory_access_type)
- && SLP_TREE_LANES (slp_node) == 1
- && (!grouped_load
- || !DR_GROUP_NEXT_ELEMENT (first_stmt_info)))))
- {
- slp_perm = true;
-
- if (!loop_vinfo && cost_vec)
- {
- /* In BB vectorization we may not actually use a loaded vector
- accessing elements in excess of DR_GROUP_SIZE. */
- stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0];
- group_info = DR_GROUP_FIRST_ELEMENT (group_info);
- unsigned HOST_WIDE_INT nunits;
- unsigned j, k, maxk = 0;
- FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k)
- if (k > maxk)
- maxk = k;
- tree vectype = SLP_TREE_VECTYPE (slp_node);
- if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)
- || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1)))
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "BB vectorization with gaps at the end of "
- "a load is not supported\n");
- return false;
- }
- }
-
- if (cost_vec)
- {
- if (!perm_ok)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "unsupported load permutation\n");
- return false;
- }
- ls.n_perms = n_perms;
- }
- else
- n_perms = ls.n_perms;
- }
-
if (slp_node->ldst_lanes
&& memory_access_type != VMAT_LOAD_STORE_LANES)
{
@@ -10019,7 +10030,7 @@ vectorizable_load (vec_info *vinfo,
not only the number of vector stmts the permutation result
fits in. */
int ncopies;
- if (slp_perm)
+ if (ls.slp_perm)
{
gcc_assert (memory_access_type != VMAT_ELEMENTWISE);
/* We don't yet generate SLP_TREE_LOAD_PERMUTATIONs for
@@ -10127,18 +10138,18 @@ vectorizable_load (vec_info *vinfo,
if (!costing_p)
{
- if (slp_perm)
+ if (ls.slp_perm)
dr_chain.quick_push (gimple_assign_lhs (new_stmt));
else
slp_node->push_vec_def (new_stmt);
}
}
- if (slp_perm)
+ if (ls.slp_perm)
{
if (costing_p)
{
- gcc_assert (n_perms != -1U);
- inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
+ gcc_assert (ls.n_perms != -1U);
+ inside_cost += record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
slp_node, 0, vect_body);
}
else
@@ -10146,7 +10157,7 @@ vectorizable_load (vec_info *vinfo,
unsigned n_perms2;
vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf,
false, &n_perms2);
- gcc_assert (n_perms == n_perms2);
+ gcc_assert (ls.n_perms == n_perms2);
}
}
@@ -10211,7 +10222,7 @@ vectorizable_load (vec_info *vinfo,
instead the access is contiguous but it might be
permuted. No gap adjustment is needed though. */
;
- else if (slp_perm
+ else if (ls.slp_perm
&& (group_size != scalar_lanes
|| !multiple_p (nunits, group_size)))
{
@@ -10560,7 +10571,7 @@ vectorizable_load (vec_info *vinfo,
if (mat_gather_scatter_p (memory_access_type))
{
- gcc_assert ((!grouped_load && !slp_perm) || ls.ls_type);
+ gcc_assert ((!grouped_load && !ls.slp_perm) || ls.ls_type);
/* If we pun the original vectype the loads as well as costing, length,
etc. is performed with the new type. After loading we VIEW_CONVERT
@@ -10922,14 +10933,14 @@ vectorizable_load (vec_info *vinfo,
/* Store vector loads in the corresponding SLP_NODE. */
if (!costing_p)
{
- if (slp_perm)
+ if (ls.slp_perm)
dr_chain.quick_push (gimple_assign_lhs (new_stmt));
else
slp_node->push_vec_def (new_stmt);
}
}
- if (slp_perm)
+ if (ls.slp_perm)
{
if (costing_p)
{
@@ -11026,7 +11037,7 @@ vectorizable_load (vec_info *vinfo,
stmt_info, bump);
}
- if (grouped_load || slp_perm)
+ if (grouped_load || ls.slp_perm)
dr_chain.create (vec_num);
gimple *new_stmt = NULL;
@@ -11523,11 +11534,11 @@ vectorizable_load (vec_info *vinfo,
/* Collect vector loads and later create their permutation in
vect_transform_slp_perm_load. */
- if (!costing_p && (grouped_load || slp_perm))
+ if (!costing_p && (grouped_load || ls.slp_perm))
dr_chain.quick_push (new_temp);
/* Store vector loads in the corresponding SLP_NODE. */
- if (!costing_p && !slp_perm)
+ if (!costing_p && !ls.slp_perm)
slp_node->push_vec_def (new_stmt);
/* With SLP permutation we load the gaps as well, without
@@ -11536,7 +11547,7 @@ vectorizable_load (vec_info *vinfo,
group_elt += nunits;
if (!costing_p
&& maybe_ne (group_gap_adj, 0U)
- && !slp_perm
+ && !ls.slp_perm
&& known_eq (group_elt, group_size - group_gap_adj))
{
poly_wide_int bump_val
@@ -11553,7 +11564,7 @@ vectorizable_load (vec_info *vinfo,
elements loaded for a permuted SLP load. */
if (!costing_p
&& maybe_ne (group_gap_adj, 0U)
- && slp_perm)
+ && ls.slp_perm)
{
poly_wide_int bump_val
= (wi::to_wide (TYPE_SIZE_UNIT (elem_type)) * group_gap_adj);
@@ -11564,7 +11575,7 @@ vectorizable_load (vec_info *vinfo,
stmt_info, bump);
}
- if (slp_perm)
+ if (ls.slp_perm)
{
/* For SLP we know we've seen all possible uses of dr_chain so
direct vect_transform_slp_perm_load to DCE the unused parts.
@@ -11572,9 +11583,9 @@ vectorizable_load (vec_info *vinfo,
in PR101120 and friends. */
if (costing_p)
{
- gcc_assert (n_perms != -1U);
- if (n_perms != 0)
- inside_cost = record_stmt_cost (cost_vec, n_perms, vec_perm,
+ gcc_assert (ls.n_perms != -1U);
+ if (ls.n_perms != 0)
+ inside_cost = record_stmt_cost (cost_vec, ls.n_perms, vec_perm,
slp_node, 0, vect_body);
}
else
@@ -11583,7 +11594,7 @@ vectorizable_load (vec_info *vinfo,
bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain,
gsi, vf, false, &n_perms2,
nullptr, true);
- gcc_assert (ok && n_perms == n_perms2);
+ gcc_assert (ok && ls.n_perms == n_perms2);
}
dr_chain.release ();
}
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 52bc0d672bf..d2c5f2ba51f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -209,6 +209,13 @@ enum vect_memory_access_type {
VMAT_GATHER_SCATTER_EMULATED
};
+enum vect_gather_scatter_subtype {
+ GATHER_SCATTER_UNDEFINED,
+ GATHER_SCATTER_REGULAR,
+ GATHER_SCATTER_STRIDED,
+ GATHER_SCATTER_STRIDED_GROUPED
+};
+
/* Returns whether MAT is any of the VMAT_GATHER_SCATTER_* kinds. */
inline bool
@@ -290,6 +297,8 @@ struct vect_load_store_data : vect_data {
tree strided_offset_vectype; // VMAT_GATHER_SCATTER_IFN, originally strided
tree ls_type; // VMAT_GATHER_SCATTER_IFN
auto_vec<int> elsvals;
+ /* True if the load requires a load permutation. */
+ bool slp_perm; // SLP_TREE_LOAD_PERMUTATION
unsigned n_perms; // SLP_TREE_LOAD_PERMUTATION
};
--
2.51.0