Richard Biener <rguent...@suse.de> writes: > On Mon, 14 Feb 2022, Richard Sandiford wrote: > >> ldp_stp_1.c, ldp_stp_4.c and ldp_stp_5.c have been failing since >> vectorisation was enabled at -O2. In all three cases SLP is >> generating vector code when scalar code would be better. >> >> The problem is that the target costs do not model whether STP could >> be used for the scalar or vector code, so the normal latency-based >> costs for store-heavy code can be way off. It would be good to fix >> that “properly” at some point, but it isn't easy; see the existing >> discussion in aarch64_sve_adjust_stmt_cost for more details. >> >> This patch therefore adds an on-the-side check for whether the >> code is doing nothing more than set-up+stores. It then applies >> STP-based costs to those cases only, in addition to the normal >> latency-based costs. (That is, the vector code has to win on >> both counts rather than on one count individually.) >> >> However, at the moment, SLP costs one vector set-up instruction >> for every vector in an SLP node, even if the contents are the >> same as a previous vector in the same node. Fixing the STP costs >> without fixing that would regress other cases, tested in the patch. >> >> The patch therefore makes the SLP costing code check for duplicates >> within a node. Ideally we'd check for duplicates more globally, >> but that would require a more global approach to costs: the cost >> of an initialisation should be amortised across all trees that >> use the initialisation, rather than fully counted against one >> arbitrarily-chosen subtree. >> >> Back on aarch64: an earlier version of the patch tried to apply >> the new heuristic to constant stores. However, that didn't work >> too well in practice; see the comments for details. The patch >> therefore just tests the status quo for constant cases, leaving out >> a match if the current choice is dubious. >> >> ldp_stp_5.c was affected by the same thing.
The test would be >> worth vectorising if we generated better vector code, but: >> >> (1) We do a bad job of moving the { -1, 1 } constant, given that >> we have { -1, -1 } and { 1, 1 } to hand. >> >> (2) The vector code has 6 pairable stores to misaligned offsets. >> We have peephole patterns to handle such misalignment for >> 4 pairable stores, but not 6. >> >> So the SLP decision isn't wrong as such. It's just being let >> down by later codegen. >> >> The patch therefore adds -mstrict-align to preserve the original >> intention of the test while adding ldp_stp_19.c to check for the >> preferred vector code (XFAILed for now). >> >> Tested on aarch64-linux-gnu, aarch64_be-elf and x86_64-linux-gnu. >> OK for the vectoriser bits? > > I'll look at the patch tomorrow but it reminded me of an old > patch I'm still sitting on which reworked the SLP discovery > cache to be based on defs rather than stmts which allows us to > cache and re-use SLP nodes for invariants during SLP discovery.
Ah, yeah, that should help with the “more global” bit. I think in the end we need both though: reduce duplicate nodes, and remove duplicate vectors (or at least duplicate vector costs) within a node. Thanks, Richard > From 8df9c7003611e690bd08fd5cff0b624527c99bf4 Mon Sep 17 00:00:00 2001 > From: Richard Biener <rguent...@suse.de> > Date: Fri, 20 Mar 2020 11:42:47 +0100 > Subject: [PATCH] rework SLP caching based on ops and CSE constants > To: gcc-patches@gcc.gnu.org > > This reworks SLP caching so that it keys on the defs and not > their defining stmts so we can use it to CSE SLP nodes for > constants and invariants. > > 2020-03-19 Richard Biener <rguent...@suse.de> > > * tree-vect-slp.c (): ... > --- > gcc/tree-vect-slp.c | 222 +++++++++++++++++++++++++++++--------------- > 1 file changed, 149 insertions(+), 73 deletions(-) > > diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c > index 1ffbf6f6af9..e545e34e353 100644 > --- a/gcc/tree-vect-slp.c > +++ b/gcc/tree-vect-slp.c > @@ -129,36 +129,38 @@ vect_free_slp_instance (slp_instance instance, bool > final_p) > free (instance); > } > > - > -/* Create an SLP node for SCALAR_STMTS. */ > - > -static slp_tree > -vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops) > -{ > - slp_tree node = new _slp_tree; > - SLP_TREE_SCALAR_STMTS (node) = scalar_stmts; > - SLP_TREE_CHILDREN (node).create (nops); > - SLP_TREE_DEF_TYPE (node) = vect_internal_def; > - SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0]; > - SLP_TREE_LANES (node) = scalar_stmts.length (); > - > - unsigned i; > - stmt_vec_info stmt_info; > - FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info) > - STMT_VINFO_NUM_SLP_USES (stmt_info)++; > - > - return node; > -} > - > /* Create an SLP node for OPS. 
*/ > > static slp_tree > -vect_create_new_slp_node (vec<tree> ops) > +vect_create_new_slp_node (vec_info *vinfo, > + vec<tree> ops, unsigned nops = 0, > + vect_def_type def_type = vect_external_def) > { > slp_tree node = new _slp_tree; > SLP_TREE_SCALAR_OPS (node) = ops; > - SLP_TREE_DEF_TYPE (node) = vect_external_def; > SLP_TREE_LANES (node) = ops.length (); > + if (nops != 0 > + || (def_type != vect_external_def && def_type != vect_constant_def)) > + { > + if (nops != 0) > + SLP_TREE_CHILDREN (node).create (nops); > + SLP_TREE_DEF_TYPE (node) = vect_internal_def; > + > + SLP_TREE_SCALAR_STMTS (node).create (ops.length ()); > + unsigned i; > + tree op; > + FOR_EACH_VEC_ELT (ops, i, op) > + { > + stmt_vec_info stmt_info = vinfo->lookup_def (op); > + STMT_VINFO_NUM_SLP_USES (stmt_info)++; > + SLP_TREE_SCALAR_STMTS (node).quick_push (stmt_info); > + if (i == 0) > + SLP_TREE_REPRESENTATIVE (node) = stmt_info; > + } > + } > + else > + SLP_TREE_DEF_TYPE (node) = vect_external_def; > + > return node; > } > > @@ -168,8 +170,6 @@ vect_create_new_slp_node (vec<tree> ops) > node. */ > typedef struct _slp_oprnd_info > { > - /* Def-stmts for the operands. */ > - vec<stmt_vec_info> def_stmts; > /* Operands. 
*/ > vec<tree> ops; > /* Information about the first statement, its vector def-type, type, the > @@ -194,7 +194,6 @@ vect_create_oprnd_info (int nops, int group_size) > for (i = 0; i < nops; i++) > { > oprnd_info = XNEW (struct _slp_oprnd_info); > - oprnd_info->def_stmts.create (group_size); > oprnd_info->ops.create (group_size); > oprnd_info->first_dt = vect_uninitialized_def; > oprnd_info->first_op_type = NULL_TREE; > @@ -216,7 +215,6 @@ vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info) > > FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info) > { > - oprnd_info->def_stmts.release (); > oprnd_info->ops.release (); > XDELETE (oprnd_info); > } > @@ -459,7 +457,10 @@ again: > } > > if (def_stmt_info && is_pattern_stmt_p (def_stmt_info)) > - oprnd_info->any_pattern = true; > + { > + oprnd_info->any_pattern = true; > + oprnd = gimple_get_lhs (def_stmt_info->stmt); > + } > > if (first) > { > @@ -541,7 +542,6 @@ again: > oprnd_info->first_dt = vect_external_def; > /* Fallthru. */ > case vect_constant_def: > - oprnd_info->def_stmts.quick_push (NULL); > oprnd_info->ops.quick_push (oprnd); > break; > > @@ -559,13 +559,11 @@ again: > us a sane SLP graph (still the stmts are not 100% > correct wrt the initial values). */ > gcc_assert (!first); > - oprnd_info->def_stmts.quick_push (oprnd_info->def_stmts[0]); > oprnd_info->ops.quick_push (oprnd_info->ops[0]); > break; > } > /* Fallthru. */ > case vect_induction_def: > - oprnd_info->def_stmts.quick_push (def_stmt_info); > oprnd_info->ops.quick_push (oprnd); > break; > > @@ -1096,8 +1094,8 @@ vect_build_slp_tree_1 (vec_info *vinfo, unsigned char > *swap, > need a special value for deleted that differs from empty. 
*/ > struct bst_traits > { > - typedef vec <stmt_vec_info> value_type; > - typedef vec <stmt_vec_info> compare_type; > + typedef vec <tree> value_type; > + typedef vec <tree> compare_type; > static inline hashval_t hash (value_type); > static inline bool equal (value_type existing, value_type candidate); > static inline bool is_empty (value_type x) { return !x.exists (); } > @@ -1112,7 +1110,10 @@ bst_traits::hash (value_type x) > { > inchash::hash h; > for (unsigned i = 0; i < x.length (); ++i) > - h.add_int (gimple_uid (x[i]->stmt)); > + /* ??? FP constants are not shared so we can't use simple > + pointer hashing and equivalence which would work if we'd > + just care for SSA names here. */ > + inchash::add_expr (x[i], h, 0); > return h.end (); > } > inline bool > @@ -1121,30 +1122,33 @@ bst_traits::equal (value_type existing, value_type > candidate) > if (existing.length () != candidate.length ()) > return false; > for (unsigned i = 0; i < existing.length (); ++i) > - if (existing[i] != candidate[i]) > + if (existing[i] != candidate[i] > + && (!types_compatible_p (TREE_TYPE (existing[i]), > + TREE_TYPE (candidate[i])) > + || !operand_equal_p (existing[i], candidate[i], 0))) > return false; > return true; > } > > -typedef hash_map <vec <gimple *>, slp_tree, > +typedef hash_map <vec <tree>, slp_tree, > simple_hashmap_traits <bst_traits, slp_tree> > > scalar_stmts_to_slp_tree_map_t; > > static slp_tree > vect_build_slp_tree_2 (vec_info *vinfo, > - vec<stmt_vec_info> stmts, unsigned int group_size, > + vec<tree> defs, unsigned int group_size, > poly_uint64 *max_nunits, > bool *matches, unsigned *npermutes, unsigned *tree_size, > scalar_stmts_to_slp_tree_map_t *bst_map); > > static slp_tree > vect_build_slp_tree (vec_info *vinfo, > - vec<stmt_vec_info> stmts, unsigned int group_size, > + vec<tree> defs, unsigned int group_size, > poly_uint64 *max_nunits, > bool *matches, unsigned *npermutes, unsigned *tree_size, > scalar_stmts_to_slp_tree_map_t *bst_map) > { > - if 
(slp_tree *leader = bst_map->get (stmts)) > + if (slp_tree *leader = bst_map->get (defs)) > { > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n", > @@ -1157,7 +1161,7 @@ vect_build_slp_tree (vec_info *vinfo, > return *leader; > } > poly_uint64 this_max_nunits = 1; > - slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, > + slp_tree res = vect_build_slp_tree_2 (vinfo, defs, group_size, > &this_max_nunits, > matches, npermutes, tree_size, bst_map); > if (res) > @@ -1167,7 +1171,7 @@ vect_build_slp_tree (vec_info *vinfo, > /* Keep a reference for the bst_map use. */ > res->refcnt++; > } > - bst_map->put (stmts.copy (), res); > + bst_map->put (defs.copy (), res); > return res; > } > > @@ -1180,7 +1184,7 @@ vect_build_slp_tree (vec_info *vinfo, > > static slp_tree > vect_build_slp_tree_2 (vec_info *vinfo, > - vec<stmt_vec_info> stmts, unsigned int group_size, > + vec<tree> defs, unsigned int group_size, > poly_uint64 *max_nunits, > bool *matches, unsigned *npermutes, unsigned *tree_size, > scalar_stmts_to_slp_tree_map_t *bst_map) > @@ -1189,8 +1193,54 @@ vect_build_slp_tree_2 (vec_info *vinfo, > poly_uint64 this_max_nunits = *max_nunits; > slp_tree node; > > - matches[0] = false; > + auto_vec<stmt_vec_info> stmts; > + stmts.create (defs.length ()); > + vect_def_type dt; > + vect_def_type def0_type = vect_constant_def; > + stmt_vec_info def_info; > + if (!vect_is_simple_use (defs[0], vinfo, &def0_type, &def_info)) > + return NULL; > + stmts.quick_push (def_info); > + /* Fail gracefully to allow eventual splitting. 
*/ > + matches[0] = true; > + bool fail = false; > + for (i = 1; i < defs.length (); ++i) > + { > + if (!vect_is_simple_use (defs[i], vinfo, &dt, &def_info)) > + return NULL; > + stmts.quick_push (def_info); > + if ((def0_type == vect_constant_def > + || def0_type == vect_external_def) > + != (dt == vect_constant_def > + || dt == vect_external_def)) > + { > + matches[i] = false; > + fail = true; > + } > + else > + matches[i] = true; > + if (dt == vect_external_def > + && def0_type == vect_constant_def) > + def0_type = vect_external_def; > + } > + /* Deal with mismatches in internal vs. invariant/external defs. */ > + if (fail) > + return NULL; > + if (def0_type == vect_external_def > + || def0_type == vect_constant_def) > + { > + tree scalar_type = TREE_TYPE (defs[0]); > + tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type, > + group_size); > + if (!vect_record_max_nunits (vinfo, NULL, group_size, vectype, > + max_nunits)) > + return NULL; > + node = vect_create_new_slp_node (vinfo, defs, 0, def0_type); > + SLP_TREE_VECTYPE (node) = vectype; > + return node; > + } > > + matches[0] = false; > stmt_vec_info stmt_info = stmts[0]; > if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt)) > nops = gimple_call_num_args (stmt); > @@ -1237,7 +1287,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, > else > return NULL; > (*tree_size)++; > - node = vect_create_new_slp_node (stmts, 0); > + node = vect_create_new_slp_node (vinfo, defs, 0, vect_internal_def); > SLP_TREE_VECTYPE (node) = vectype; > return node; > } > @@ -1325,23 +1375,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, > continue; > } > > - if (oprnd_info->first_dt != vect_internal_def > - && oprnd_info->first_dt != vect_reduction_def > - && oprnd_info->first_dt != vect_induction_def) > - { > - slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops); > - SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt; > - oprnd_info->ops = vNULL; > - children.safe_push (invnode); > - continue; > - } > - > - if ((child 
= vect_build_slp_tree (vinfo, oprnd_info->def_stmts, > + if ((child = vect_build_slp_tree (vinfo, oprnd_info->ops, > group_size, &this_max_nunits, > matches, npermutes, > &this_tree_size, bst_map)) != NULL) > { > - oprnd_info->def_stmts = vNULL; > + oprnd_info->ops = vNULL; > children.safe_push (child); > continue; > } > @@ -1366,10 +1405,9 @@ vect_build_slp_tree_2 (vec_info *vinfo, > dump_printf_loc (MSG_NOTE, vect_location, > "Building vector operands from scalars\n"); > this_tree_size++; > - child = vect_create_new_slp_node (oprnd_info->ops); > + child = vect_create_new_slp_node (vinfo, oprnd_info->ops); > children.safe_push (child); > oprnd_info->ops = vNULL; > - oprnd_info->def_stmts = vNULL; > continue; > } > > @@ -1424,8 +1462,6 @@ vect_build_slp_tree_2 (vec_info *vinfo, > for (j = 0; j < group_size; ++j) > if (matches[j] == !swap_not_matching) > { > - std::swap (oprnds_info[0]->def_stmts[j], > - oprnds_info[1]->def_stmts[j]); > std::swap (oprnds_info[0]->ops[j], > oprnds_info[1]->ops[j]); > if (dump_enabled_p ()) > @@ -1435,12 +1471,12 @@ vect_build_slp_tree_2 (vec_info *vinfo, > dump_printf (MSG_NOTE, "\n"); > /* And try again with scratch 'matches' ... */ > bool *tem = XALLOCAVEC (bool, group_size); > - if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts, > + if ((child = vect_build_slp_tree (vinfo, oprnd_info->ops, > group_size, &this_max_nunits, > tem, npermutes, > &this_tree_size, bst_map)) != NULL) > { > - oprnd_info->def_stmts = vNULL; > + oprnd_info->ops = vNULL; > children.safe_push (child); > continue; > } > @@ -1513,7 +1549,7 @@ fail: > > /* Here we record the original defs since this > node represents the final lane configuration. 
*/ > - node = vect_create_new_slp_node (stmts, 2); > + node = vect_create_new_slp_node (vinfo, defs, 2); > SLP_TREE_VECTYPE (node) = vectype; > SLP_TREE_CODE (node) = VEC_PERM_EXPR; > SLP_TREE_CHILDREN (node).quick_push (one); > @@ -1544,7 +1580,7 @@ fail: > return node; > } > > - node = vect_create_new_slp_node (stmts, nops); > + node = vect_create_new_slp_node (vinfo, defs, nops); > SLP_TREE_VECTYPE (node) = vectype; > SLP_TREE_CHILDREN (node).splice (children); > return node; > @@ -2070,12 +2106,35 @@ vect_analyze_slp_instance (vec_info *vinfo, > /* Create a node (a root of the SLP tree) for the packed grouped stores. > */ > scalar_stmts.create (group_size); > stmt_vec_info next_info = stmt_info; > + vec<tree> defs; > + defs.create (group_size); > if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > { > /* Collect the stores and store them in SLP_TREE_SCALAR_STMTS. */ > while (next_info) > { > + /* Just needed for the root SLP node, otherwise "wrong". */ > scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); > + /* Defs to seed the SLP tree from (excluding the store itself). */ > + tree def > + = gimple_assign_rhs1 (vect_stmt_to_vectorize (next_info)->stmt); > + if (stmt_vec_info defstmt = vinfo->lookup_def (def)) > + def = gimple_get_lhs (vect_stmt_to_vectorize (defstmt)->stmt); > + defs.safe_push (def); > + if (is_a <bb_vec_info> (vinfo)) > + { > + /* For BB vectorization we have to perform late vectype > + assignment to stores. */ > + tree vectype, nunits_vectype; > + if (!vect_get_vector_types_for_stmt (vinfo, next_info, &vectype, > + &nunits_vectype, group_size) > + || !vect_update_shared_vectype (next_info, vectype)) > + { > + defs.release (); > + scalar_stmts.release (); > + return false; > + } > + } > next_info = DR_GROUP_NEXT_ELEMENT (next_info); > } > } > @@ -2085,7 +2144,9 @@ vect_analyze_slp_instance (vec_info *vinfo, > SLP_TREE_SCALAR_STMTS. 
*/ > while (next_info) > { > - scalar_stmts.safe_push (vect_stmt_to_vectorize (next_info)); > + stmt_vec_info def_info = vect_stmt_to_vectorize (next_info); > + scalar_stmts.safe_push (def_info); > + defs.quick_push (gimple_get_lhs (def_info->stmt)); > next_info = REDUC_GROUP_NEXT_ELEMENT (next_info); > } > /* Mark the first element of the reduction chain as reduction to > properly > @@ -2110,7 +2171,7 @@ vect_analyze_slp_instance (vec_info *vinfo, > if (!def_info) > return false; > def_info = vect_stmt_to_vectorize (def_info); > - scalar_stmts.safe_push (def_info); > + defs.quick_push (gimple_get_lhs (def_info->stmt)); > } > else > return false; > @@ -2125,7 +2186,7 @@ vect_analyze_slp_instance (vec_info *vinfo, > /* Collect reduction statements. */ > vec<stmt_vec_info> reductions = as_a <loop_vec_info> > (vinfo)->reductions; > for (i = 0; reductions.iterate (i, &next_info); i++) > - scalar_stmts.safe_push (next_info); > + defs.quick_push (gimple_get_lhs (next_info->stmt)); > } > > /* Build the tree for the SLP instance. 
*/ > @@ -2133,7 +2194,7 @@ vect_analyze_slp_instance (vec_info *vinfo, > unsigned npermutes = 0; > poly_uint64 max_nunits = nunits; > unsigned tree_size = 0; > - node = vect_build_slp_tree (vinfo, scalar_stmts, group_size, > + node = vect_build_slp_tree (vinfo, defs, group_size, > &max_nunits, matches, &npermutes, > &tree_size, bst_map); > if (node != NULL) > @@ -2238,11 +2299,11 @@ vect_analyze_slp_instance (vec_info *vinfo, > gcc_assert (r); > next_info = vinfo->lookup_stmt (use_stmt); > next_info = vect_stmt_to_vectorize (next_info); > - scalar_stmts = vNULL; > - scalar_stmts.create (group_size); > + vec<tree> scalar_ops; > + scalar_ops.create (group_size); > for (unsigned i = 0; i < group_size; ++i) > - scalar_stmts.quick_push (next_info); > - slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1); > + scalar_ops.quick_push (gimple_get_lhs (next_info->stmt)); > + slp_tree conv = vect_create_new_slp_node (vinfo, scalar_ops, 1); > SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info); > SLP_TREE_CHILDREN (conv).quick_push (node); > SLP_INSTANCE_TREE (new_instance) = conv; > @@ -2252,6 +2313,21 @@ vect_analyze_slp_instance (vec_info *vinfo, > REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info; > REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL; > } > + else if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) > + { > + /* Put the root store group in. */ > + slp_tree store = vect_create_new_slp_node (vinfo, vNULL, 1, > + vect_internal_def); > + SLP_TREE_SCALAR_STMTS (store) = scalar_stmts; > + stmt_vec_info stmt; > + FOR_EACH_VEC_ELT (scalar_stmts, i, stmt) > + STMT_VINFO_NUM_SLP_USES (stmt)++; > + SLP_TREE_REPRESENTATIVE (store) = scalar_stmts[0]; > + SLP_TREE_VECTYPE (store) = STMT_VINFO_VECTYPE (scalar_stmts[0]); > + SLP_TREE_LANES (store) = scalar_stmts.length (); > + SLP_TREE_CHILDREN (store).quick_push (node); > + SLP_INSTANCE_TREE (new_instance) = store; > + } > > vinfo->slp_instances.safe_push (new_instance);