On Tue, Sep 17, 2024 at 11:53 PM Richard Biener <richard.guent...@gmail.com> wrote: > > On Tue, Sep 17, 2024 at 4:36 AM Andrew Pinski <quic_apin...@quicinc.com> > wrote: > > > > This adds simple_dce_worklist to both the SLP vectorizer and the loop based > > vectorizer. > > This is a step into removing the dce after the loop based vectorizer. That > > DCE still > > does a few things, removing some of the induction variables which has > > become unused. That is > > something which can be improved afterwards. > > > > Note this adds it to the SLP BB vectorizer too as it is used from the loop > > based one sometimes. > > In the case of the BB SLP vectorizer, the dead statements don't get removed > > until much later in > > DSE so removing them much earlier is important. > > > > Note on the new testcase, it came up during bootstrap where the SLP pass > > would cause the need to > > invalidate the scev caches but there was no testcase for this beforehand so > > adding one is a good idea. > > > > Bootstrapped and tested on x86_64-linux-gnu with no regressions. > > In the places you add to the worklist in vectorizable_* can you please > see to do that in a place > where we could actually remove the stmt (and release the def)? Please > also add a > (inline) function like vect_remove_scalar_stmt (vinfo *, X) with X > either a stmt_vec_info (preferred) > or a gimple *.
I was thinking about this and the only place where I know 100% that we might be removing the statement is `vec_info::remove_stmt` which also might be just enough to remove all of the scalar cases. Let me try removing the places which call bitmap_set_bit except for that one and report back. Though induction variables might still need to be removed too; I have to dig into that. Thanks, Andrew > > Thanks, > Richard. > > > PR tree-optimization/116711 > > > > gcc/ChangeLog: > > > > * tree-ssa-dce.cc (simple_dce_from_worklist): Returns > > true if something was removed. > > * tree-ssa-dce.h (simple_dce_from_worklist): Change return > > type to bool. > > * tree-vect-loop.cc (vectorizable_induction): Add phi result > > to the dce worklist. > > * tree-vect-slp.cc: Add includes of tree-ssa-dce.h, > > tree-ssa-loop-niter.h and tree-scalar-evolution.h. > > (vect_slp_region): Add DCE_WORKLIST argument. Copy > > the dce_worklist from the bb vectorization info. > > (vect_slp_bbs): Add DCE_WORKLIST argument. Update call to > > vect_slp_region. > > (vect_slp_if_converted_bb): Add DCE_WORKLIST argument. Update > > call to vect_slp_bbs. > > (vect_slp_function): Update call to vect_slp_bbs and call > > simple_dce_from_worklist. Also free the loop iteration and > > scev cache if something was removed. > > * tree-vect-stmts.cc (vectorizable_bswap): Add the lhs of the > > scalar stmt > > to the dce work list. > > (vectorizable_call): Likewise. > > (vectorizable_simd_clone_call): Likewise. > > (vectorizable_conversion): Likewise. > > (vectorizable_assignment): Likewise. > > (vectorizable_shift): Likewise. > > (vectorizable_operation): Likewise. > > (vectorizable_condition): Likewise. > > (vectorizable_comparison_1): Likewise. > > * tree-vectorizer.cc: Include tree-ssa-dce.h. > > (vec_info::remove_stmt): Add all of the uses of the store to the > > dce work list. > > (try_vectorize_loop_1): Update call to vect_slp_if_converted_bb. > > Copy the dce worklist into the loop's vectinfo dce worklist. > > (pass_vectorize::execute): Copy loops' vectinfo dce worklist > > locally. > > Add call to simple_dce_from_worklist. > > * tree-vectorizer.h (vec_info): Add dce_worklist field. > > (vect_slp_if_converted_bb): Add bitmap argument. > > * tree-vectorizer.h (vect_slp_if_converted_bb): Add bitmap argument. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.dg/vect/bb-slp-77.c: New test. > > > > Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com> > > --- > > gcc/testsuite/gcc.dg/vect/bb-slp-77.c | 15 +++++++++++++ > > gcc/tree-ssa-dce.cc | 5 +++-- > > gcc/tree-ssa-dce.h | 2 +- > > gcc/tree-vect-loop.cc | 3 +++ > > gcc/tree-vect-slp.cc | 32 ++++++++++++++++++++------- > > gcc/tree-vect-stmts.cc | 16 +++++++++++++- > > gcc/tree-vectorizer.cc | 21 +++++++++++++++++- > > gcc/tree-vectorizer.h | 5 ++++- > > 8 files changed, 85 insertions(+), 14 deletions(-) > > create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-77.c > > > > diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > > b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > > new file mode 100644 > > index 00000000000..a74bb17e25c > > --- /dev/null > > +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c > > @@ -0,0 +1,15 @@ > > +/* { dg-do compile } */ > > + > > +/* Make sure SLP vectorization updates the estimated loop bounds > > correctly. */ > > + > > +void g(int); > > +void f(int *a) > > +{ > > + int n = a[0]++; > > + int g1 = a[1]++; > > + for(int i = 0; i < n; i++) > > + g(g1); > > +} > > + > > +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" { > > target vect_int } } } */ > > + > > diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc > > index 69249c73013..87c5df4216b 100644 > > --- a/gcc/tree-ssa-dce.cc > > +++ b/gcc/tree-ssa-dce.cc > > @@ -2170,9 +2170,9 @@ make_pass_cd_dce (gcc::context *ctxt) > > /* A cheap DCE interface. WORKLIST is a list of possibly dead stmts and > > is consumed by this function. The function has linear complexity in > > the number of dead stmts with a constant factor like the average SSA > > - use operands number. */ > > + use operands number. Returns true if something was removed. */ > > > > -void > > +bool > > simple_dce_from_worklist (bitmap worklist, bitmap need_eh_cleanup) > > { > > int phiremoved = 0; > > @@ -2269,4 +2269,5 @@ simple_dce_from_worklist (bitmap worklist, bitmap > > need_eh_cleanup) > > phiremoved); > > statistics_counter_event (cfun, "Statements removed", > > stmtremoved); > > + return phiremoved != 0 || stmtremoved != 0; > > } > > diff --git a/gcc/tree-ssa-dce.h b/gcc/tree-ssa-dce.h > > index b0e92a58ea8..5c46f67b184 100644 > > --- a/gcc/tree-ssa-dce.h > > +++ b/gcc/tree-ssa-dce.h > > @@ -18,5 +18,5 @@ along with GCC; see the file COPYING3. If not see > > > > #ifndef TREE_SSA_DCE_H > > #define TREE_SSA_DCE_H > > -extern void simple_dce_from_worklist (bitmap, bitmap = nullptr); > > +extern bool simple_dce_from_worklist (bitmap, bitmap = nullptr); > > #endif > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > > index 62c7f90779f..e99a9e932b2 100644 > > --- a/gcc/tree-vect-loop.cc > > +++ b/gcc/tree-vect-loop.cc > > @@ -10432,6 +10432,9 @@ vectorizable_induction (loop_vec_info loop_vinfo, > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_NOTE, vect_location, "transform induction > > phi.\n"); > > > > + /* Mark the induction phi for maybe removal. */ > > + bitmap_set_bit (loop_vinfo->dce_worklist, SSA_NAME_VERSION > > (gimple_phi_result (phi))); > > + > > pe = loop_preheader_edge (iv_loop); > > /* Find the first insertion point in the BB. */ > > basic_block bb = gimple_bb (phi); > > diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc > > index 4fcb9e2fa2b..3ad8767e0bb 100644 > > --- a/gcc/tree-vect-slp.cc > > +++ b/gcc/tree-vect-slp.cc > > @@ -53,6 +53,9 @@ along with GCC; see the file COPYING3. If not see > > #include "alloc-pool.h" > > #include "sreal.h" > > #include "predict.h" > > +#include "tree-ssa-dce.h" > > +#include "tree-ssa-loop-niter.h" > > +#include "tree-scalar-evolution.h" > > > > static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree, > > load_permutation_t &, > > @@ -9009,7 +9012,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int > > n_stmts, bool &fatal, > > static bool > > vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs, > > vec<int> *dataref_groups, unsigned int n_stmts, > > - loop_p orig_loop) > > + loop_p orig_loop, bitmap dce_worklist) > > { > > bb_vec_info bb_vinfo; > > auto_vector_modes vector_modes; > > @@ -9175,6 +9178,8 @@ vect_slp_region (vec<basic_block> bbs, > > vec<data_reference_p> datarefs, > > mode_i += 1; > > } > > > > + /* Copy the dce info. */ > > + bitmap_ior_into (dce_worklist, bb_vinfo->dce_worklist); > > delete bb_vinfo; > > > > if (mode_i < vector_modes.length () > > @@ -9217,7 +9222,8 @@ vect_slp_region (vec<basic_block> bbs, > > vec<data_reference_p> datarefs, > > true if anything in the basic-block was vectorized. */ > > > > static bool > > -vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop) > > +vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop, > > + bitmap dce_worklist) > > { > > vec<data_reference_p> datarefs = vNULL; > > auto_vec<int> dataref_groups; > > @@ -9247,7 +9253,8 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p > > orig_loop) > > ++current_group; > > } > > > > - return vect_slp_region (bbs, datarefs, &dataref_groups, insns, > > orig_loop); > > + return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop, > > + dce_worklist); > > } > > > > /* Special entry for the BB vectorizer. Analyze and transform a single > > @@ -9256,11 +9263,12 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p > > orig_loop) > > vectorized. */ > > > > bool > > -vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop) > > +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop, > > + bitmap dce_worklist) > > { > > auto_vec<basic_block> bbs; > > bbs.safe_push (bb); > > - return vect_slp_bbs (bbs, orig_loop); > > + return vect_slp_bbs (bbs, orig_loop, dce_worklist); > > } > > > > /* Main entry for the BB vectorizer. Analyze and transform BB, returns > > @@ -9269,6 +9277,7 @@ vect_slp_if_converted_bb (basic_block bb, loop_p > > orig_loop) > > bool > > vect_slp_function (function *fun) > > { > > + auto_bitmap dce_worklist; > > bool r = false; > > int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun)); > > auto_bitmap exit_bbs; > > @@ -9326,7 +9335,7 @@ vect_slp_function (function *fun) > > > > if (split && !bbs.is_empty ()) > > { > > - r |= vect_slp_bbs (bbs, NULL); > > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > > bbs.truncate (0); > > } > > > > @@ -9363,15 +9372,22 @@ vect_slp_function (function *fun) > > dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > > "splitting region at control altering " > > "definition %G", last); > > - r |= vect_slp_bbs (bbs, NULL); > > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > > bbs.truncate (0); > > } > > } > > > > if (!bbs.is_empty ()) > > - r |= vect_slp_bbs (bbs, NULL); > > + r |= vect_slp_bbs (bbs, NULL, dce_worklist); > > > > free (rpo); > > + if (simple_dce_from_worklist (dce_worklist)) > > + { > > + /* If DCE removed something, we need to invalidate the caches. */ > > + free_numbers_of_iterations_estimates (cfun); > > + if (scev_initialized_p ()) > > + scev_reset (); > > + } > > > > return r; > > } > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > > index b1353c91fce..ea6a2c66b59 100644 > > --- a/gcc/tree-vect-stmts.cc > > +++ b/gcc/tree-vect-stmts.cc > > @@ -3104,6 +3104,8 @@ vectorizable_bswap (vec_info *vinfo, > > tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices); > > > > /* Transform. */ > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (gimple_call_lhs > > (stmt))); > > + > > vec<tree> vec_oprnds = vNULL; > > vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, > > op, &vec_oprnds); > > @@ -3496,6 +3498,7 @@ vectorizable_call (vec_info *vinfo, > > > > /* Handle def. */ > > scalar_dest = gimple_call_lhs (stmt); > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > vec_dest = vect_create_destination_var (scalar_dest, vectype_out); > > > > bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P > > (loop_vinfo); > > @@ -4404,6 +4407,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, > > stmt_vec_info stmt_info, > > ratype = NULL_TREE; > > if (scalar_dest) > > { > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > rtype = TREE_TYPE (TREE_TYPE (fndecl)); > > if (TREE_CODE (rtype) == ARRAY_TYPE) > > @@ -5644,6 +5648,8 @@ vectorizable_conversion (vec_info *vinfo, > > op1 = fold_convert (TREE_TYPE (op0), op1); > > } > > > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > + > > /* In case of multi-step conversion, we first generate conversion > > operations > > to the intermediate types, and then from that types to the final one. > > We create vector destinations for the intermediate type (TYPES) > > received > > @@ -6006,6 +6012,8 @@ vectorizable_assignment (vec_info *vinfo, > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n"); > > > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > + > > /* Handle def. */ > > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > > > @@ -6400,6 +6408,7 @@ vectorizable_shift (vec_info *vinfo, > > TREE_TYPE (vectype), NULL); > > } > > > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > /* Handle def. */ > > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > > > @@ -6872,6 +6881,7 @@ vectorizable_operation (vec_info *vinfo, > > } > > > > /* Transform. */ > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > > > if (dump_enabled_p ()) > > dump_printf_loc (MSG_NOTE, vect_location, > > @@ -12393,6 +12403,7 @@ vectorizable_condition (vec_info *vinfo, > > scalar_dest = gimple_assign_lhs (stmt); > > if (reduction_type != EXTRACT_LAST_REDUCTION) > > vec_dest = vect_create_destination_var (scalar_dest, vectype); > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest)); > > > > bool swap_cond_operands = false; > > > > @@ -12829,7 +12840,10 @@ vectorizable_comparison_1 (vec_info *vinfo, tree > > vectype, > > /* Handle def. */ > > lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info)); > > if (lhs) > > - mask = vect_create_destination_var (lhs, mask_type); > > + { > > + bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (lhs)); > > + mask = vect_create_destination_var (lhs, mask_type); > > + } > > > > vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies, > > rhs1, vectype, &vec_oprnds0, > > diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc > > index 4279b6db4cf..15a0fe26b13 100644 > > --- a/gcc/tree-vectorizer.cc > > +++ b/gcc/tree-vectorizer.cc > > @@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see > > #include "internal-fn.h" > > #include "tree-ssa-sccvn.h" > > #include "tree-into-ssa.h" > > +#include "tree-ssa-dce.h" > > > > /* Loop or bb location, with hotness information. */ > > dump_user_location_t vect_location; > > @@ -620,6 +621,16 @@ vec_info::remove_stmt (stmt_vec_info stmt_info) > > gcc_assert (!stmt_info->pattern_stmt_p); > > set_vinfo_for_stmt (stmt_info->stmt, NULL); > > unlink_stmt_vdef (stmt_info->stmt); > > + ssa_op_iter iter; > > + use_operand_p use_p; > > + /* Mark all of the uses from the store as possible dceing. */ > > + FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_info->stmt, iter, SSA_OP_USE) > > + { > > + tree use = USE_FROM_PTR (use_p); > > + if (TREE_CODE (use) == SSA_NAME > > + && ! SSA_NAME_IS_DEFAULT_DEF (use)) > > + bitmap_set_bit (dce_worklist, SSA_NAME_VERSION (use)); > > + } > > gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt); > > gsi_remove (&si, true); > > release_defs (stmt_info->stmt); > > @@ -1121,13 +1132,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> > > *&simduid_to_vf_htab, > > } > > if (!require_loop_vectorize) > > { > > + auto_bitmap dce_worklist; > > tree arg = gimple_call_arg (loop_vectorized_call, 1); > > class loop *scalar_loop = get_loop (fun, tree_to_shwi (arg)); > > - if (vect_slp_if_converted_bb (bb, scalar_loop)) > > + if (vect_slp_if_converted_bb (bb, scalar_loop, dce_worklist)) > > { > > fold_loop_internal_call (loop_vectorized_call, > > boolean_true_node); > > loop_vectorized_call = NULL; > > + /* Copy the DCE info if we have a loop vinfo around. */ > > + if (loop_vinfo) > > + bitmap_ior_into (loop_vinfo->dce_worklist, > > dce_worklist); > > ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals; > > } > > } > > @@ -1236,6 +1251,7 @@ pass_vectorize::execute (function *fun) > > hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL; > > bool any_ifcvt_loops = false; > > unsigned ret = 0; > > + auto_bitmap dce_worklist; > > > > vect_loops_num = number_of_loops (fun); > > > > @@ -1374,6 +1390,8 @@ pass_vectorize::execute (function *fun) > > continue; > > loop_vinfo = (loop_vec_info) loop->aux; > > has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo); > > + /* Copy the dce info. */ > > + bitmap_ior_into (dce_worklist, loop_vinfo->dce_worklist); > > delete loop_vinfo; > > if (has_mask_store > > && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE)) > > @@ -1395,6 +1413,7 @@ pass_vectorize::execute (function *fun) > > } > > > > vect_slp_fini (); > > + simple_dce_from_worklist (dce_worklist); > > > > return ret; > > } > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > > index 699ae9e33ba..573ec9da4b2 100644 > > --- a/gcc/tree-vectorizer.h > > +++ b/gcc/tree-vectorizer.h > > @@ -512,6 +512,9 @@ public: > > /* The count of the basic blocks in the vectorization region. */ > > unsigned int nbbs; > > > > + /* Bitmap for dce_worklist */ > > + auto_bitmap dce_worklist; > > + > > private: > > stmt_vec_info new_stmt_vec_info (gimple *stmt); > > void set_vinfo_for_stmt (gimple *, stmt_vec_info, bool = true); > > @@ -2546,7 +2549,7 @@ extern void vect_gather_slp_loads (vec_info *); > > extern void vect_get_slp_defs (slp_tree, vec<tree> *); > > extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *, > > unsigned n = -1U); > > -extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop); > > +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop, > > bitmap); > > extern bool vect_slp_function (function *); > > extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree); > > extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree); > > -- > > 2.43.0 > >