On Tue, Sep 17, 2024 at 4:36 AM Andrew Pinski <quic_apin...@quicinc.com> wrote:
>
> This adds simple_dce_worklist to both the SLP vectorizer and the loop based 
> vectorizer.
> This is a step into removing the dce after the loop based vectorizer. That 
> DCE still
> does a few things, removing some of the induction variables which has become 
> unused. That is
> something which can be improved afterwards.
>
> Note this adds it to the SLP BB vectorizer too as it is used from the loop 
> based one sometimes.
> In the case of the BB SLP vectorizer, the dead statements don't get removed 
> until much later in
> DSE so removing them much earlier is important.
>
> Note on the new testcase, it came up during bootstrap where the SLP pass 
> would cause the need to
> invalidate the scev caches but there was no testcase for this beforehand so 
> adding one is a good idea.
>
> Bootstrapped and tested on x86_64-linux-gnu with no regressions.

In the places you add to the worklist in vectorizable_* can you please
see to do that in a place
where we could actually remove the stmt (and release the def)?  Please
also add a
(inline) function like vect_remove_scalar_stmt (vinfo *, X) with X
either a stmt_vec_info (preferred)
or a gimple *.

Thanks,
Richard.

>         PR tree-optimization/116711
>
> gcc/ChangeLog:
>
>         * tree-ssa-dce.cc (simple_dce_from_worklist): Returns
>         true if something was removed.
>         * tree-ssa-dce.h (simple_dce_from_worklist): Change return
>         type to bool.
>         * tree-vect-loop.cc (vectorizable_induction): Add phi result
>         to the dce worklist.
>         * tree-vect-slp.cc: Add includes of tree-ssa-dce.h,
>         tree-ssa-loop-niter.h and tree-scalar-evolution.h.
>         (vect_slp_region): Add DCE_WORKLIST argument. Copy
>         the dce_worklist from the bb vectorization info.
>         (vect_slp_bbs): Add DCE_WORKLIST argument. Update call to
>         vect_slp_region.
>         (vect_slp_if_converted_bb): Add DCE_WORKLIST argument. Update
>         call to vect_slp_bbs.
>         (vect_slp_function): Update call to vect_slp_bbs and call
>         simple_dce_from_worklist. Also free the loop iteration and
>         scev cache if something was removed.
>         * tree-vect-stmts.cc (vectorizable_bswap): Add the lhs of the scalar 
> stmt
>         to the dce work list.
>         (vectorizable_call): Likewise.
>         (vectorizable_simd_clone_call): Likewise.
>         (vectorizable_conversion): Likewise.
>         (vectorizable_assignment): Likewise.
>         (vectorizable_shift): Likewise.
>         (vectorizable_operation): Likewise.
>         (vectorizable_condition): Likewise.
>         (vectorizable_comparison_1): Likewise.
>         * tree-vectorizer.cc: Include tree-ssa-dce.h.
>         (vec_info::remove_stmt): Add all of the uses of the store to the
>         dce work list.
>         (try_vectorize_loop_1): Update call to vect_slp_if_converted_bb.
>         Copy the dce worklist into the loop's vectinfo dce worklist.
>         (pass_vectorize::execute): Copy loops' vectinfo dce worklist locally.
>         Add call to simple_dce_from_worklist.
>         * tree-vectorizer.h (vec_info): Add dce_worklist field.
>         (vect_slp_if_converted_bb): Add bitmap argument.
>         * tree-vectorizer.h (vect_slp_if_converted_bb): Add bitmap argument.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/vect/bb-slp-77.c: New test.
>
> Signed-off-by: Andrew Pinski <quic_apin...@quicinc.com>
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-77.c | 15 +++++++++++++
>  gcc/tree-ssa-dce.cc                   |  5 +++--
>  gcc/tree-ssa-dce.h                    |  2 +-
>  gcc/tree-vect-loop.cc                 |  3 +++
>  gcc/tree-vect-slp.cc                  | 32 ++++++++++++++++++++-------
>  gcc/tree-vect-stmts.cc                | 16 +++++++++++++-
>  gcc/tree-vectorizer.cc                | 21 +++++++++++++++++-
>  gcc/tree-vectorizer.h                 |  5 ++++-
>  8 files changed, 85 insertions(+), 14 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-77.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-77.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c
> new file mode 100644
> index 00000000000..a74bb17e25c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-77.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +
> +/* Make sure SLP vectorization updates the estimated loop bounds correctly. 
> */
> +
> +void g(int);
> +void f(int *a)
> +{
> +  int n = a[0]++;
> +  int g1 = a[1]++;
> +  for(int i = 0; i < n; i++)
> +    g(g1);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: basic block" 1 "slp1" { 
> target vect_int } } } */
> +
> diff --git a/gcc/tree-ssa-dce.cc b/gcc/tree-ssa-dce.cc
> index 69249c73013..87c5df4216b 100644
> --- a/gcc/tree-ssa-dce.cc
> +++ b/gcc/tree-ssa-dce.cc
> @@ -2170,9 +2170,9 @@ make_pass_cd_dce (gcc::context *ctxt)
>  /* A cheap DCE interface.  WORKLIST is a list of possibly dead stmts and
>     is consumed by this function.  The function has linear complexity in
>     the number of dead stmts with a constant factor like the average SSA
> -   use operands number.  */
> +   use operands number. Returns true if something was removed.  */
>
> -void
> +bool
>  simple_dce_from_worklist (bitmap worklist, bitmap need_eh_cleanup)
>  {
>    int phiremoved = 0;
> @@ -2269,4 +2269,5 @@ simple_dce_from_worklist (bitmap worklist, bitmap 
> need_eh_cleanup)
>                             phiremoved);
>    statistics_counter_event (cfun, "Statements removed",
>                             stmtremoved);
> +  return phiremoved != 0 || stmtremoved != 0;
>  }
> diff --git a/gcc/tree-ssa-dce.h b/gcc/tree-ssa-dce.h
> index b0e92a58ea8..5c46f67b184 100644
> --- a/gcc/tree-ssa-dce.h
> +++ b/gcc/tree-ssa-dce.h
> @@ -18,5 +18,5 @@ along with GCC; see the file COPYING3.  If not see
>
>  #ifndef TREE_SSA_DCE_H
>  #define TREE_SSA_DCE_H
> -extern void simple_dce_from_worklist (bitmap, bitmap = nullptr);
> +extern bool simple_dce_from_worklist (bitmap, bitmap = nullptr);
>  #endif
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 62c7f90779f..e99a9e932b2 100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -10432,6 +10432,9 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
>
> +  /* Mark the induction phi for maybe removal.  */
> +  bitmap_set_bit (loop_vinfo->dce_worklist, SSA_NAME_VERSION 
> (gimple_phi_result (phi)));
> +
>    pe = loop_preheader_edge (iv_loop);
>    /* Find the first insertion point in the BB.  */
>    basic_block bb = gimple_bb (phi);
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 4fcb9e2fa2b..3ad8767e0bb 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -53,6 +53,9 @@ along with GCC; see the file COPYING3.  If not see
>  #include "alloc-pool.h"
>  #include "sreal.h"
>  #include "predict.h"
> +#include "tree-ssa-dce.h"
> +#include "tree-ssa-loop-niter.h"
> +#include "tree-scalar-evolution.h"
>
>  static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
>                                             load_permutation_t &,
> @@ -9009,7 +9012,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int 
> n_stmts, bool &fatal,
>  static bool
>  vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
>                  vec<int> *dataref_groups, unsigned int n_stmts,
> -                loop_p orig_loop)
> +                loop_p orig_loop, bitmap dce_worklist)
>  {
>    bb_vec_info bb_vinfo;
>    auto_vector_modes vector_modes;
> @@ -9175,6 +9178,8 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>             mode_i += 1;
>           }
>
> +      /* Copy the dce info. */
> +      bitmap_ior_into (dce_worklist, bb_vinfo->dce_worklist);
>        delete bb_vinfo;
>
>        if (mode_i < vector_modes.length ()
> @@ -9217,7 +9222,8 @@ vect_slp_region (vec<basic_block> bbs, 
> vec<data_reference_p> datarefs,
>     true if anything in the basic-block was vectorized.  */
>
>  static bool
> -vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
> +vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop,
> +             bitmap dce_worklist)
>  {
>    vec<data_reference_p> datarefs = vNULL;
>    auto_vec<int> dataref_groups;
> @@ -9247,7 +9253,8 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p 
> orig_loop)
>        ++current_group;
>      }
>
> -  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
> +  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop,
> +                         dce_worklist);
>  }
>
>  /* Special entry for the BB vectorizer.  Analyze and transform a single
> @@ -9256,11 +9263,12 @@ vect_slp_bbs (const vec<basic_block> &bbs, loop_p 
> orig_loop)
>     vectorized.  */
>
>  bool
> -vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
> +vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop,
> +                         bitmap dce_worklist)
>  {
>    auto_vec<basic_block> bbs;
>    bbs.safe_push (bb);
> -  return vect_slp_bbs (bbs, orig_loop);
> +  return vect_slp_bbs (bbs, orig_loop, dce_worklist);
>  }
>
>  /* Main entry for the BB vectorizer.  Analyze and transform BB, returns
> @@ -9269,6 +9277,7 @@ vect_slp_if_converted_bb (basic_block bb, loop_p 
> orig_loop)
>  bool
>  vect_slp_function (function *fun)
>  {
> +  auto_bitmap dce_worklist;
>    bool r = false;
>    int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
>    auto_bitmap exit_bbs;
> @@ -9326,7 +9335,7 @@ vect_slp_function (function *fun)
>
>        if (split && !bbs.is_empty ())
>         {
> -         r |= vect_slp_bbs (bbs, NULL);
> +         r |= vect_slp_bbs (bbs, NULL, dce_worklist);
>           bbs.truncate (0);
>         }
>
> @@ -9363,15 +9372,22 @@ vect_slp_function (function *fun)
>               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                                "splitting region at control altering "
>                                "definition %G", last);
> -           r |= vect_slp_bbs (bbs, NULL);
> +           r |= vect_slp_bbs (bbs, NULL, dce_worklist);
>             bbs.truncate (0);
>           }
>      }
>
>    if (!bbs.is_empty ())
> -    r |= vect_slp_bbs (bbs, NULL);
> +    r |= vect_slp_bbs (bbs, NULL, dce_worklist);
>
>    free (rpo);
> +  if (simple_dce_from_worklist (dce_worklist))
> +    {
> +      /* If DCE removed something, we need to invalidate the caches.  */
> +      free_numbers_of_iterations_estimates (cfun);
> +      if (scev_initialized_p ())
> +       scev_reset ();
> +    }
>
>    return r;
>  }
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index b1353c91fce..ea6a2c66b59 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -3104,6 +3104,8 @@ vectorizable_bswap (vec_info *vinfo,
>    tree bswap_vconst = vec_perm_indices_to_tree (char_vectype, indices);
>
>    /* Transform.  */
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (gimple_call_lhs 
> (stmt)));
> +
>    vec<tree> vec_oprnds = vNULL;
>    vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
>                      op, &vec_oprnds);
> @@ -3496,6 +3498,7 @@ vectorizable_call (vec_info *vinfo,
>
>    /* Handle def.  */
>    scalar_dest = gimple_call_lhs (stmt);
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
>    vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
>
>    bool masked_loop_p = loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
> @@ -4404,6 +4407,7 @@ vectorizable_simd_clone_call (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>    ratype = NULL_TREE;
>    if (scalar_dest)
>      {
> +      bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
>        vec_dest = vect_create_destination_var (scalar_dest, vectype);
>        rtype = TREE_TYPE (TREE_TYPE (fndecl));
>        if (TREE_CODE (rtype) == ARRAY_TYPE)
> @@ -5644,6 +5648,8 @@ vectorizable_conversion (vec_info *vinfo,
>         op1 = fold_convert (TREE_TYPE (op0), op1);
>      }
>
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
> +
>    /* In case of multi-step conversion, we first generate conversion 
> operations
>       to the intermediate types, and then from that types to the final one.
>       We create vector destinations for the intermediate type (TYPES) received
> @@ -6006,6 +6012,8 @@ vectorizable_assignment (vec_info *vinfo,
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location, "transform assignment.\n");
>
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
> +
>    /* Handle def.  */
>    vec_dest = vect_create_destination_var (scalar_dest, vectype);
>
> @@ -6400,6 +6408,7 @@ vectorizable_shift (vec_info *vinfo,
>                                 TREE_TYPE (vectype), NULL);
>      }
>
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
>    /* Handle def.  */
>    vec_dest = vect_create_destination_var (scalar_dest, vectype);
>
> @@ -6872,6 +6881,7 @@ vectorizable_operation (vec_info *vinfo,
>      }
>
>    /* Transform.  */
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
>
>    if (dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
> @@ -12393,6 +12403,7 @@ vectorizable_condition (vec_info *vinfo,
>    scalar_dest = gimple_assign_lhs (stmt);
>    if (reduction_type != EXTRACT_LAST_REDUCTION)
>      vec_dest = vect_create_destination_var (scalar_dest, vectype);
> +  bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (scalar_dest));
>
>    bool swap_cond_operands = false;
>
> @@ -12829,7 +12840,10 @@ vectorizable_comparison_1 (vec_info *vinfo, tree 
> vectype,
>    /* Handle def.  */
>    lhs = gimple_get_lhs (STMT_VINFO_STMT (stmt_info));
>    if (lhs)
> -    mask = vect_create_destination_var (lhs, mask_type);
> +    {
> +      bitmap_set_bit (vinfo->dce_worklist, SSA_NAME_VERSION (lhs));
> +      mask = vect_create_destination_var (lhs, mask_type);
> +    }
>
>    vect_get_vec_defs (vinfo, stmt_info, slp_node, ncopies,
>                      rhs1, vectype, &vec_oprnds0,
> diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
> index 4279b6db4cf..15a0fe26b13 100644
> --- a/gcc/tree-vectorizer.cc
> +++ b/gcc/tree-vectorizer.cc
> @@ -84,6 +84,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "internal-fn.h"
>  #include "tree-ssa-sccvn.h"
>  #include "tree-into-ssa.h"
> +#include "tree-ssa-dce.h"
>
>  /* Loop or bb location, with hotness information.  */
>  dump_user_location_t vect_location;
> @@ -620,6 +621,16 @@ vec_info::remove_stmt (stmt_vec_info stmt_info)
>    gcc_assert (!stmt_info->pattern_stmt_p);
>    set_vinfo_for_stmt (stmt_info->stmt, NULL);
>    unlink_stmt_vdef (stmt_info->stmt);
> +  ssa_op_iter iter;
> +  use_operand_p use_p;
> +  /* Mark all of the uses from the store as possible dceing. */
> +  FOR_EACH_PHI_OR_STMT_USE (use_p, stmt_info->stmt, iter, SSA_OP_USE)
> +    {
> +      tree use = USE_FROM_PTR (use_p);
> +      if (TREE_CODE (use) == SSA_NAME
> +         && ! SSA_NAME_IS_DEFAULT_DEF (use))
> +       bitmap_set_bit (dce_worklist, SSA_NAME_VERSION (use));
> +    }
>    gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt);
>    gsi_remove (&si, true);
>    release_defs (stmt_info->stmt);
> @@ -1121,13 +1132,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> 
> *&simduid_to_vf_htab,
>             }
>           if (!require_loop_vectorize)
>             {
> +             auto_bitmap dce_worklist;
>               tree arg = gimple_call_arg (loop_vectorized_call, 1);
>               class loop *scalar_loop = get_loop (fun, tree_to_shwi (arg));
> -             if (vect_slp_if_converted_bb (bb, scalar_loop))
> +             if (vect_slp_if_converted_bb (bb, scalar_loop, dce_worklist))
>                 {
>                   fold_loop_internal_call (loop_vectorized_call,
>                                            boolean_true_node);
>                   loop_vectorized_call = NULL;
> +                 /* Copy the DCE info if we have a loop vinfo around. */
> +                 if (loop_vinfo)
> +                   bitmap_ior_into (loop_vinfo->dce_worklist, dce_worklist);
>                   ret |= TODO_cleanup_cfg | TODO_update_ssa_only_virtuals;
>                 }
>             }
> @@ -1236,6 +1251,7 @@ pass_vectorize::execute (function *fun)
>    hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
>    bool any_ifcvt_loops = false;
>    unsigned ret = 0;
> +  auto_bitmap dce_worklist;
>
>    vect_loops_num = number_of_loops (fun);
>
> @@ -1374,6 +1390,8 @@ pass_vectorize::execute (function *fun)
>         continue;
>        loop_vinfo = (loop_vec_info) loop->aux;
>        has_mask_store = LOOP_VINFO_HAS_MASK_STORE (loop_vinfo);
> +      /* Copy the dce info. */
> +      bitmap_ior_into (dce_worklist, loop_vinfo->dce_worklist);
>        delete loop_vinfo;
>        if (has_mask_store
>           && targetm.vectorize.empty_mask_is_expensive (IFN_MASK_STORE))
> @@ -1395,6 +1413,7 @@ pass_vectorize::execute (function *fun)
>      }
>
>    vect_slp_fini ();
> +  simple_dce_from_worklist (dce_worklist);
>
>    return ret;
>  }
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 699ae9e33ba..573ec9da4b2 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -512,6 +512,9 @@ public:
>    /* The count of the basic blocks in the vectorization region.  */
>    unsigned int nbbs;
>
> +  /* Bitmap for dce_worklist */
> +  auto_bitmap dce_worklist;
> +
>  private:
>    stmt_vec_info new_stmt_vec_info (gimple *stmt);
>    void set_vinfo_for_stmt (gimple *, stmt_vec_info, bool = true);
> @@ -2546,7 +2549,7 @@ extern void vect_gather_slp_loads (vec_info *);
>  extern void vect_get_slp_defs (slp_tree, vec<tree> *);
>  extern void vect_get_slp_defs (vec_info *, slp_tree, vec<vec<tree> > *,
>                                unsigned n = -1U);
> -extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop);
> +extern bool vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop, 
> bitmap);
>  extern bool vect_slp_function (function *);
>  extern stmt_vec_info vect_find_last_scalar_stmt_in_slp (slp_tree);
>  extern stmt_vec_info vect_find_first_scalar_stmt_in_slp (slp_tree);
> --
> 2.43.0
>

Reply via email to