On Tue, Sep 17, 2024 at 4:36 AM Andrew Pinski <quic_apin...@quicinc.com> wrote: > > This adds simple_dce_worklist to both the SLP vectorizer and the loop based > vectorizer. > This is a step into removing the dce after the loop based vectorizer. That > DCE still > does a few things, removing some of the induction variables which has become > unused. That is > something which can be improved afterwards. > > Note this adds it to the SLP BB vectorizer too as it is used from the loop > based one sometimes. > In the case of the BB SLP vectorizer, the dead statements don't get removed > until much later in > DSE so removing them much earlier is important. > > Note on the new testcase, it came up during bootstrap where the SLP pass > would cause the need to > invalidate the scev caches but there was no testcase for this beforehand so > adding one is a good idea. > > Bootstrapped and tested on x86_64-linux-gnu with no regressions.
In the places you add to the worklist in vectorizable_* can you please see to do that in a place where we could actually remove the stmt (and release the def)? Please also add a (inline) function like vect_remove_scalar_stmt (vinfo *, X) with X either a stmt_vec_info (preferred) or a gimple *. Thanks, Richard. > PR tree-optimization/116711 > > gcc/ChangeLog: > > * tree-ssa-dce.cc (simple_dce_from_worklist): Returns > true if something was removed. > * tree-ssa-dce.h (simple_dce_from_worklist): Change return > type to bool. > * tree-vect-loop.cc (vectorizable_induction): Add phi result > to the dce worklist. > * tree-vect-slp.cc: Add includes of tree-ssa-dce.h, > tree-ssa-loop-niter.h and tree-scalar-evolution.h. > (vect_slp_region): Add DCE_WORKLIST argument. Copy > the dce_worklist from the bb vectorization info. > (vect_slp_bbs): Add DCE_WORKLIST argument. Update call to > vect_slp_region. > (vect_slp_if_converted_bb): Add DCE_WORKLIST argument. Update > call to vect_slp_bbs. > (vect_slp_function): Update call to vect_slp_bbs and call > simple_dce_from_worklist. Also free the loop iteration and > scev cache if something was removed. > * tree-vect-stmts.cc (vectorizable_bswap): Add the lhs of the scalar > stmt > to the dce work list. > (vectorizable_call): Likewise. > (vectorizable_simd_clone_call): Likewise. > (vectorizable_conversion): Likewise. > (vectorizable_assignment): Likewise. > (vectorizable_shift): Likewise. > (vectorizable_operation): Likewise. > (vectorizable_condition): Likewise. > (vectorizable_comparison_1): Likewise. > * tree-vectorizer.cc: Include tree-ssa-dce.h. > (vec_info::remove_stmt): Add all of the uses of the store to the > dce work list. > (try_vectorize_loop_1): Update call to vect_slp_if_converted_bb. > Copy the dce worklist into the loop's vectinfo dce worklist. > (pass_vectorize::execute): Copy loops' vectinfo dce worklist locally. > Add call to simple_dce_from_worklist. > * tree-vectorizer.h (vec_info): Add dce_worklist field. > (vect_slp_if_converted_bb): Add bitmap argument. > * tree-vectorizer.h (vect_slp_if_converted_bb): Add bitmap argument. > > gcc/testsuite/ChangeLog: > > * gcc.dg/vect/bb-slp-77.c: New test. > > Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> > --- > gcc/testsuite/gcc.dg/vect/bb-slp-77.c | 15 +++++++++++++ > gcc/tree-ssa-dce.cc | 5 +++-- > gcc/tree-ssa-dce.h | 2 +- > gcc/tree-vect-loop.cc | 3 +++ > gcc/tree-vect-slp.cc | 32 ++++++++++++++++++++------- > gcc/tree-vect-stmts.cc | 16 +++++++++++++- > gcc/tree-vectorizer.cc | 21 +++++++++++++++++- > gcc/tree-vectorizer.h | 5 ++++- > 8 files changed, 85 insertions(+), 14 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-77.c > > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > new file mode 100644 > index 00000000000..a74bb17e25c > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > + > +/* Make sure SLP vectorization updates the estimated loop bounds correctly. > */ > + > +void g(int); > +void f(int *a) > +{ > + int n = a[0]++; > + int g1 = a[1]++; > + for(int i = 0; i < n; i++) > + g(g1); > +} > + > +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" { > target vect_int } } } */ > + > diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc > index 69249c73013..87c5df4216b 100644 > --- a/gcc/tree-ssa-dce.cc > +++ b/gcc/tree-ssa-dce.cc > @@ -2170,9 +2170,9 @@ make_pass_cd_dce (gcc::context *ctxt) > /* A cheap DCE interface. WORKLIST is a list of possibly dead stmts and > is consumed by this function. The function has linear complexity in > the number of dead stmts with a constant factor like the average SSA > - use operands number. */ > + use operands number. Returns true if something was removed. */ > > -void > +bool > simple_dce_from_worklist (bitmap worklist, bitmap need_eh_cleanup) > { > int phiremoved = 0; > @@ -2269,4 +2269,5 @@ simple_dce_from_worklist (bitmap worklist, bitmap > need_eh_cleanup) > phiremoved); > statistics_counter_event (cfun, "Statements removed", > stmtremoved); > + return phiremoved != 0 || stmtremoved != 0; > } > diff --git a/gcc/tree-ssa-dce.h b/gcc/tree-ssa-dce.h > index b0e92a58ea8..5c46f67b184 100644 > --- a/gcc/tree-ssa-dce.h > +++ b/gcc/tree-ssa-dce.h > @@ -18,5 +18,5 @@ along with GCC; see the file COPYING3. If not see > > #ifndef TREE_SSA_DCE_H > #define TREE_SSA_DCE_H > -extern void simple_dce_from_worklist (bitmap, bitmap = nullptr); > +extern bool simple_dce_from_worklist (bitmap, bitmap = nullptr); > #endif > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index 62c7f90779f..e99a9e932b2 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -10432,6 +10432,9 @@ vectorizable_induction (loop_vec_info loop_vinfo, > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n"); > > + /* Mark the induction phi for maybe removal. */ > + bitmap_set_bit (loop_vinfo->dce_worklist, SSA_NAME_VERSION > (gimple_phi_result (phi))); > + > pe = loop_preheader_edge (iv_loop); > /* Find the first insertion point in the BB. */ > basic_block bb = gimple_bb (phi); > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > index 4fcb9e2fa2b..3ad8767e0bb 100644 > --- a/gcc/tree-vect-slp.cc > +++ b/gcc/tree-vect-slp.cc > @@ -53,6 +53,9 @@ along with GCC; see the file COPYING3. If not see > #include "alloc-pool.h" > #include "sreal.h" > #include "predict.h" > +#include "tree-ssa-dce.h" > +#include "tree-ssa-loop-niter.h" > +#include "tree-scalar-evolution.h" > > static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree, > load_permutation_t &, > @@ -9009,7 +9012,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int > n_stmts, bool &fatal, > static bool > vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > vec<int> *dataref_groups, unsigned int n_stmts, > - loop_p orig_loop) > + loop_p orig_loop, bitmap dce_worklist) > { > bb_vec_info bb_vinfo; > auto_vector_modes vector_modes; > @@ -9175,6 +9178,8 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > mode_i += 1; > } > > + /* Copy the dce info. */ > + bitmap_ior_into (dce_worklist, bb_vinfo->dce_worklist); > delete bb_vinfo; > > if (mode_i < vector_modes.length () > @@ -9217,7 +9222,8 @@ vect_slp_region (vec<basic_block> bbs, > vec<data_reference_p> datarefs, > true if anything in the basic-block was vectorized. */ > > static bool > -vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) > +vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop, > + bitmap dce_worklist) > { > vec<data_reference_p> datarefs = vNULL; > auto_vec<int> dataref_groups; > @@ -9247,7 +9253,8 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p > orig_loop) > ++current_group; > } > > - return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop); > + return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop, > + dce_worklist); > } > > /* Special entry for the BB vectorizer. Analyze and transform a single > @@ -9256,11 +9263,12 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p > orig_loop) > vectorized. */ > > bool > -vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) > +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop, > + bitmap dce_worklist) > { > auto_vec<basic_block> bbs; > bbs.safe_push (bb); > - return vect_slp_bbs (bbs, orig_loop); > + return vect_slp_bbs (bbs, orig_loop, dce_worklist); > } > > /* Main entry for the BB vectorizer. Analyze and transform BB, returns > @@ -9269,6 +9277,7 @@ vect_slp_if_converted_bb (basic_block bb, loop_p > orig_loop) > bool > vect_slp_function (function *fun) > { > + auto_bitmap dce_worklist; > bool r = false; > int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun)); > auto_bitmap exit_bbs; > @@ -9326,7 +9335,7 @@ vect_slp_function (function *fun) > > if (split && !bbs.is_empty ()) > { > - r |= vect_slp_bbs (bbs, NULL); > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > bbs.truncate (0); > } > > @@ -9363,15 +9372,22 @@ vect_slp_function (function *fun) > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > "splitting region at control altering " > "definition %G", last); > - r |= vect_slp_bbs (bbs, NULL); > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > bbs.truncate (0); > } > } > > if (!bbs.is_empty ()) > - r |= vect_slp_bbs (bbs, NULL); > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > > free (rpo); > + if (simple_dce_from_worklist (dce_worklist)) > + { > + /* If DCE removed something, we need to invalidate the caches. */ > + free_numbers_of_iterations_estimates (cfun); > + if (scev_initialized_p ()) > + scev_reset (); > + } > > return r; > } > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index b1353c91fce..ea6a2c66b59 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -3104,6 +3104,8 @@ vectorizable_bswap (vec_info *vinfo, > tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices); > > /* Transform. */ > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (gimple_call_lhs > (stmt))); > + > vec<tree> vec_oprnds = vNULL; > vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, > op, &vec_oprnds); > @@ -3496,6 +3498,7 @@ vectorizable_call (vec_info *vinfo, > > /* Handle def. */ > scalar_dest = gimple_call_lhs (stmt); > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > vec_dest = vect_create_destination_var (scalar_dest, vectype_out); > > bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo); > @@ -4404,6 +4407,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, > stmt_vec_info stmt_info, > ratype = NULL_TREE; > if (scalar_dest) > { > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > vec_dest = vect_create_destination_var (scalar_dest, vectype); > rtype = TREE_TYPE (TREE_TYPE (fndecl)); > if (TREE_CODE (rtype) == ARRAY_TYPE) > @@ -5644,6 +5648,8 @@ vectorizable_conversion (vec_info *vinfo, > op1 = fold_convert (TREE_TYPE (op0), op1); > } > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > + > /* In case of multi-step conversion, we first generate conversion > operations > to the intermediate types, and then from that types to the final one. > We create vector destinations for the intermediate type (TYPES) received > @@ -6006,6 +6012,8 @@ vectorizable_assignment (vec_info *vinfo, > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n"); > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > + > /* Handle def. */ > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > @@ -6400,6 +6408,7 @@ vectorizable_shift (vec_info *vinfo, > TREE_TYPE (vectype), NULL); > } > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > /* Handle def. */ > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > @@ -6872,6 +6881,7 @@ vectorizable_operation (vec_info *vinfo, > } > > /* Transform. */ > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > if (dump_enabled_p ()) > dump_printf_loc (MSG_NOTE, vect_location, > @@ -12393,6 +12403,7 @@ vectorizable_condition (vec_info *vinfo, > scalar_dest = gimple_assign_lhs (stmt); > if (reduction_type != EXTRACT_LAST_REDUCTION) > vec_dest = vect_create_destination_var (scalar_dest, vectype); > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > bool swap_cond_operands = false; > > @@ -12829,7 +12840,10 @@ vectorizable_comparison_1 (vec_info *vinfo, tree > vectype, > /* Handle def. */ > lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info)); > if (lhs) > - mask = vect_create_destination_var (lhs, mask_type); > + { > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (lhs)); > + mask = vect_create_destination_var (lhs, mask_type); > + } > > vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, > rhs1, vectype, &vec_oprnds0, > diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc > index 4279b6db4cf..15a0fe26b13 100644 > --- a/gcc/tree-vectorizer.cc > +++ b/gcc/tree-vectorizer.cc > @@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see > #include "internal-fn.h" > #include "tree-ssa-sccvn.h" > #include "tree-into-ssa.h" > +#include "tree-ssa-dce.h" > > /* Loop or bb location, with hotness information. */ > dump_user_location_t vect_location; > @@ -620,6 +621,16 @@ vec_info::remove_stmt (stmt_vec_info stmt_info) > gcc_assert (!stmt_info->pattern_stmt_p); > set_vinfo_for_stmt (stmt_info->stmt, NULL); > unlink_stmt_vdef (stmt_info->stmt); > + ssa_op_iter iter; > + use_operand_p use_p; > + /* Mark all of the uses from the store as possible dceing. */ > + FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_info->stmt, iter, SSA_OP_USE) > + { > + tree use = USE_FROM_PTR (use_p); > + if (TREE_CODE (use) == SSA_NAME > + && ! SSA_NAME_IS_DEFAULT_DEF (use)) > + bitmap_set_bit (dce_worklist, SSA_NAME_VERSION (use)); > + } > gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt); > gsi_remove (&si, true); > release_defs (stmt_info->stmt); > @@ -1121,13 +1132,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> > *&simduid_to_vf_htab, > } > if (!require_loop_vectorize) > { > + auto_bitmap dce_worklist; > tree arg = gimple_call_arg (loop_vectorized_call, 1); > class loop *scalar_loop = get_loop (fun, tree_to_shwi (arg)); > - if (vect_slp_if_converted_bb (bb, scalar_loop)) > + if (vect_slp_if_converted_bb (bb, scalar_loop, dce_worklist)) > { > fold_loop_internal_call (loop_vectorized_call, > boolean_true_node); > loop_vectorized_call = NULL; > + /* Copy the DCE info if we have a loop vinfo around. */ > + if (loop_vinfo) > + bitmap_ior_into (loop_vinfo->dce_worklist, dce_worklist); > ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; > } > } > @@ -1236,6 +1251,7 @@ pass_vectorize::execute (function *fun) > hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL; > bool any_ifcvt_loops = false; > unsigned ret = 0; > + auto_bitmap dce_worklist; > > vect_loops_num = number_of_loops (fun); > > @@ -1374,6 +1390,8 @@ pass_vectorize::execute (function *fun) > continue; > loop_vinfo = (loop_vec_info) loop->aux; > has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo); > + /* Copy the dce info. */ > + bitmap_ior_into (dce_worklist, loop_vinfo->dce_worklist); > delete loop_vinfo; > if (has_mask_store > && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE)) > @@ -1395,6 +1413,7 @@ pass_vectorize::execute (function *fun) > } > > vect_slp_fini (); > + simple_dce_from_worklist (dce_worklist); > > return ret; > } > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 699ae9e33ba..573ec9da4b2 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -512,6 +512,9 @@ public: > /* The count of the basic blocks in the vectorization region. */ > unsigned int nbbs; > > + /* Bitmap for dce_worklist */ > + auto_bitmap dce_worklist; > + > private: > stmt_vec_info new_stmt_vec_info (gimple *stmt); > void set_vinfo_for_stmt (gimple *, stmt_vec_info, bool = true); > @@ -2546,7 +2549,7 @@ extern void vect_gather_slp_loads (vec_info *); > extern void vect_get_slp_defs (slp_tree, vec<tree> *); > extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *, > unsigned n = -1U); > -extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); > +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop, > bitmap); > extern bool vect_slp_function (function *); > extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); > extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree); > -- > 2.43.0 >