On Wed, Nov 10, 2021 at 1:43 PM Jan Hubicka via Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> Hi,
> this patch implements DSE using modref summaries: if a function has no
> side effects besides storing to memory pointed to by its arguments, and
> we can prove those stores to be dead, the call can be optimized out.  So
> we handle for example:
>
> volatile int *ptr;
> struct a {
>   int a,b,c;
> } a;
> __attribute__((noinline))
> static int init (struct a*a)
> {
>   a->a=0;
>   a->b=1;
> }
> __attribute__((noinline))
> static int use (struct a*a)
> {
>   if (a->c != 3)
>     *ptr=5;
> }
>
> void
> main(void)
> {
>   struct a a;
>   init (&a);
>   a.c=3;
>   use (&a);
> }
>
> and optimize out the call to init (&a).
>
> We work quite hard to inline such constructors, and this patch is only
> effective if inlining did not happen (for whatever reason).  Still, we
> optimize out about 26 calls building tramp3d and about 70 calls during
> bootstrap (mostly ctors of poly_int).  During bootstrap most of the
> removal happens early, and we would inline the ctors unless we decide to
> optimize for size.  One call per cc1* binary is removed late, during the
> LTO build.
>
> This is more frequent in codebases with a higher abstraction penalty,
> with -Os, or with profile feedback in sections optimized for size.  I
> also hope we will be able to CSE such calls, which would make this DSE
> more important.
>
> Bootstrapped/regtested x86_64-linux, OK?
>
> gcc/ChangeLog:
>
>         * tree-ssa-alias.c (ao_ref_alias_ptr_type): Export.

ao_ref_init_from_ptr_and_range it is (that is the function the patch
actually exports; the ChangeLog entry names the wrong one).

>         * tree-ssa-alias.h (ao_ref_init_from_ptr_and_range): Declare.
>         * tree-ssa-dse.c (dse_optimize_stmt): Rename to ...
>         (dse_optimize_store): ... this.
>         (dse_optimize_call): New function.
>         (pass_dse::execute): Use dse_optimize_call and update
>         call to dse_optimize_store.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/tree-ssa/modref-dse-1.c: New test.
>         * gcc.dg/tree-ssa/modref-dse-2.c: New test.
>
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-1.c b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-1.c
> new file mode 100644
> index 00000000000..e78693b349a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-1.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-dse1" } */
> +volatile int *ptr;
> +struct a {
> +  int a,b,c;
> +} a;
> +__attribute__((noinline))
> +static int init (struct a*a)
> +{
> +  a->a=0;
> +  a->b=1;
> +}
> +__attribute__((noinline))
> +static int use (struct a*a)
> +{
> +  if (a->c != 3)
> +    *ptr=5;
> +}
> +
> +void
> +main(void)
> +{
> +  struct a a;
> +  init (&a);
> +  a.c=3;
> +  use (&a);
> +}
> +/* { dg-final { scan-tree-dump "Deleted dead store: init" "dse1" } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-2.c b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-2.c
> new file mode 100644
> index 00000000000..99c8ceb8127
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/modref-dse-2.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fdump-tree-dse2 -fno-ipa-sra -fno-ipa-cp" } */
> +volatile int *ptr;
> +struct a {
> +  int a,b,c;
> +} a;
> +__attribute__((noinline))
> +static int init (struct a*a)
> +{
> +  a->a=0;
> +  a->b=1;
> +  a->c=1;
> +}
> +__attribute__((noinline))
> +static int use (struct a*a)
> +{
> +  if (a->c != 3)
> +    *ptr=5;
> +}
> +
> +void
> +main(void)
> +{
> +  struct a a;
> +  init (&a);
> +  a.c=3;
> +  use (&a);
> +}
> +/* Only DSE2 tracks live bytes, which is needed to figure out that the
> +   store to c above is also dead.  */
> +/* { dg-final { scan-tree-dump "Deleted dead store: init" "dse2" } } */
> diff --git a/gcc/tree-ssa-alias.c b/gcc/tree-ssa-alias.c
> index eabf6805f2b..affb5d40d4b 100644
> --- a/gcc/tree-ssa-alias.c
> +++ b/gcc/tree-ssa-alias.c
> @@ -782,7 +782,7 @@ ao_ref_alias_ptr_type (ao_ref *ref)
>     The access is assumed to be only to or after the pointer target
>     adjusted by the offset, not before it (even in the case RANGE_KNOWN
>     is false).  */
>
> -static void
> +void
>  ao_ref_init_from_ptr_and_range (ao_ref *ref, tree ptr,
>                                  bool range_known,
>                                  poly_int64 offset,
> diff --git a/gcc/tree-ssa-alias.h b/gcc/tree-ssa-alias.h
> index 275dea10397..c2e28a74999 100644
> --- a/gcc/tree-ssa-alias.h
> +++ b/gcc/tree-ssa-alias.h
> @@ -111,6 +111,8 @@ ao_ref::max_size_known_p () const
>  /* In tree-ssa-alias.c  */
>  extern void ao_ref_init (ao_ref *, tree);
>  extern void ao_ref_init_from_ptr_and_size (ao_ref *, tree, tree);
> +void ao_ref_init_from_ptr_and_range (ao_ref *, tree, bool,
> +                                     poly_int64, poly_int64, poly_int64);
>  extern tree ao_ref_base (ao_ref *);
>  extern alias_set_type ao_ref_alias_set (ao_ref *);
>  extern alias_set_type ao_ref_base_alias_set (ao_ref *);
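
(A note on the newly exported interface, for readers of the patch: the
offset, size and max_size arguments are in bits, not bytes, which is why
the DSE code below shifts parm_offset by LOG2_BITS_PER_UNIT.  A
hypothetical caller, purely for illustration and not part of the patch,
describing a 4-byte store located 4 bytes past what the pointer PTR
points to, would look like:

  ao_ref ref;
  ao_ref_init_from_ptr_and_range (&ref, ptr,
                                  /* range_known= */ true,
                                  /* offset= */ 4 * BITS_PER_UNIT,
                                  /* size= */ 32,
                                  /* max_size= */ 32);

Here ptr stands for some pointer-valued tree in scope.)
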
> diff --git a/gcc/tree-ssa-dse.c b/gcc/tree-ssa-dse.c
> index 27287fe88ee..1fec9100011 100644
> --- a/gcc/tree-ssa-dse.c
> +++ b/gcc/tree-ssa-dse.c
> @@ -40,6 +40,9 @@ along with GCC; see the file COPYING3.  If not see
>  #include "gimplify.h"
>  #include "tree-eh.h"
>  #include "cfganal.h"
> +#include "cgraph.h"
> +#include "ipa-modref-tree.h"
> +#include "ipa-modref.h"
>
>  /* This file implements dead store elimination.
>
> @@ -1039,7 +1042,8 @@ delete_dead_or_redundant_assignment (gimple_stmt_iterator *gsi, const char *type
>     post dominates the first store, then the first store is dead.  */
>
>  static void
> -dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
> +dse_optimize_store (function *fun, gimple_stmt_iterator *gsi,
> +                    sbitmap live_bytes)
>  {
>    gimple *stmt = gsi_stmt (*gsi);
>
> @@ -1182,6 +1186,128 @@ dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi, sbitmap live_bytes)
>    delete_dead_or_redundant_assignment (gsi, "dead", need_eh_cleanup);
>  }
>
> +/* Try to prove, using modref summary, that all memory written to by a call
> +   is dead and remove it.  */
> +
> +static bool
> +dse_optimize_call (gimple_stmt_iterator *gsi, sbitmap live_bytes)
> +{
> +  gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
> +  tree callee = gimple_call_fndecl (stmt);
> +
> +  if (!callee)
> +    return false;
> +
> +  /* Pure/const functions are optimized by normal DCE
> +     or handled as store above.  */
> +  int flags = gimple_call_flags (stmt);
> +  if ((flags & (ECF_PURE|ECF_CONST|ECF_NOVOPS))
> +      && !(flags & (ECF_LOOPING_CONST_OR_PURE)))
> +    return false;
> +
> +  cgraph_node *node = cgraph_node::get (callee);
> +  if (!node)
> +    return false;
> +
> +  if (stmt_could_throw_p (cfun, stmt)
> +      && !cfun->can_delete_dead_exceptions)
> +    return false;
> +
> +  /* If the return value is used, the call is not dead.  */
> +  tree lhs = gimple_call_lhs (stmt);
> +  if (lhs && TREE_CODE (lhs) == SSA_NAME)
> +    {
> +      imm_use_iterator ui;
> +      gimple *use_stmt;
> +      FOR_EACH_IMM_USE_STMT (use_stmt, ui, lhs)
> +        if (!is_gimple_debug (use_stmt))
> +          return false;
> +    }
> +
> +  /* Verify that there are no side effects except for the return value
> +     and memory writes tracked by modref.  */
> +  modref_summary *summary = get_modref_function_summary (node);
> +  if (!summary || summary->writes_errno
> +      || summary->side_effects
> +      || summary->global_memory_written_p ())
> +    return false;
> +
> +  modref_base_node <alias_set_type> *base_node;
> +  modref_ref_node <alias_set_type> *ref_node;
> +  modref_access_node *access_node;
> +  size_t i, j, k;
> +  bool by_clobber_p = false;
> +  int num_tests = 0, max_tests = param_modref_max_tests;
> +
> +  /* Unlike the alias oracle we can not skip subtrees based on a TBAA
> +     check.  Count the size of the whole tree to verify that we will not
> +     need too many tests.  */
> +  FOR_EACH_VEC_SAFE_ELT (summary->stores->bases, i, base_node)
> +    FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
> +      FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
> +        if (num_tests++ > max_tests)
> +          return false;

At least the innermost loop can be done as

  if (num_tests += ref_node->accesses.length () > max_tests)

no?

> +
> +  /* Walk all memory writes and verify that they are dead.  */
> +  FOR_EACH_VEC_SAFE_ELT (summary->stores->bases, i, base_node)
> +    FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
> +      FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
> +        {
> +          /* ??? if offset is unknown it may be negative.  Not sure
> +             how to construct ref here.  */

I think you can't; you could use -poly_int64_max or so.

> +          if (!access_node->parm_offset_known)
> +            return false;

But you could do this check in the loop computing num_tests ...
(we could also cache the count and whether any of the refs have unknown
offset in the summary?)
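
Concretely, folding both of these ideas into the counting loop might
look like this (an untested sketch, not part of the patch; note the
extra parentheses, since += binds less tightly than > in C++, and
vec_safe_length to cope with a null accesses vector):

  /* Count accesses and reject summaries that are too large or that
     contain an access with unknown parameter offset.  */
  FOR_EACH_VEC_SAFE_ELT (summary->stores->bases, i, base_node)
    FOR_EACH_VEC_SAFE_ELT (base_node->refs, j, ref_node)
      {
        if ((num_tests += vec_safe_length (ref_node->accesses)) > max_tests)
          return false;
        FOR_EACH_VEC_SAFE_ELT (ref_node->accesses, k, access_node)
          if (!access_node->parm_offset_known)
            return false;
      }

The second walk below could then assume parm_offset_known and drop its
early return.
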
> +
> +          tree arg;
> +          if (access_node->parm_index == MODREF_STATIC_CHAIN_PARM)
> +            arg = gimple_call_chain (stmt);
> +          else
> +            arg = gimple_call_arg (stmt, access_node->parm_index);
> +
> +          ao_ref ref;
> +          poly_offset_int off = (poly_offset_int)access_node->offset
> +                + ((poly_offset_int)access_node->parm_offset
> +                   << LOG2_BITS_PER_UNIT);
> +          poly_int64 off2;
> +          if (!off.to_shwi (&off2))
> +            return false;
> +          ao_ref_init_from_ptr_and_range
> +                 (&ref, arg, true, off2, access_node->size,
> +                  access_node->max_size);
> +          ref.ref_alias_set = ref_node->ref;
> +          ref.base_alias_set = base_node->base;
> +
> +          bool byte_tracking_enabled
> +            = setup_live_bytes_from_ref (&ref, live_bytes);
> +          enum dse_store_status store_status;
> +
> +          store_status = dse_classify_store (&ref, stmt,
> +                                             byte_tracking_enabled,
> +                                             live_bytes, &by_clobber_p);
> +          if (store_status != DSE_STORE_DEAD)
> +            return false;
> +        }
> +
> +  /* Check also the value stored by the call.  */
> +  if (gimple_store_p (stmt))
> +    {
> +      ao_ref ref;
> +
> +      if (!initialize_ao_ref_for_dse (stmt, &ref))
> +        gcc_unreachable ();
> +      bool byte_tracking_enabled
> +        = setup_live_bytes_from_ref (&ref, live_bytes);
> +      enum dse_store_status store_status;
> +
> +      store_status = dse_classify_store (&ref, stmt,
> +                                         byte_tracking_enabled,
> +                                         live_bytes, &by_clobber_p);
> +      if (store_status != DSE_STORE_DEAD)
> +        return false;
> +    }
> +  delete_dead_or_redundant_assignment (gsi, "dead", need_eh_cleanup);
> +  return true;
> +}
> +
>  namespace {
>
>  const pass_data pass_data_dse =
> @@ -1235,7 +1363,14 @@ pass_dse::execute (function *fun)
>        gimple *stmt = gsi_stmt (gsi);
>
>        if (gimple_vdef (stmt))
> -        dse_optimize_stmt (fun, &gsi, live_bytes);
> +        {
> +          gcall *call = dyn_cast <gcall *> (stmt);
> +
> +          if (call && dse_optimize_call (&gsi, live_bytes))
> +            /* We removed a dead call.  */;
> +          else
> +            dse_optimize_store (fun, &gsi, live_bytes);

I think we want to refactor both functions: dse_optimize_stmt has some
early outs that apply generally, and it handles some builtin calls that
we don't want to re-handle with dse_optimize_call.  So I wonder whether
it is possible to call the new function from inside dse_optimize_stmt
instead (after we have handled the return value of the call, for
example), or whether a different refactoring can make the flow more
obvious.

Thanks,
Richard.

> +        }
>        else if (def_operand_p
>                   def_p = single_ssa_def_operand (stmt, SSA_OP_DEF))
>          {
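
One possible shape for the refactoring suggested above, as an untested
sketch only (the function bodies elided in comments are the ones already
in tree-ssa-dse.c; the name dse_optimize_stmt follows the pre-patch
naming):

  /* Sketch: keep the common early outs and the builtin
     (memset/memcpy-style) handling in one entry point, and fall back
     to the modref-based path for other calls with a vdef.  */
  static void
  dse_optimize_stmt (function *fun, gimple_stmt_iterator *gsi,
                     sbitmap live_bytes)
  {
    gimple *stmt = gsi_stmt (*gsi);

    /* ... the existing early outs that apply to all statements ... */

    if (is_gimple_call (stmt))
      {
        /* ... the existing handling of recognized builtins stays here
           and returns early ... */

        /* For other calls, try to prove all modref-tracked writes
           dead, after the return value has been dealt with.  */
        if (dse_optimize_call (gsi, live_bytes))
          return;
      }

    /* ... the existing handling of plain stores ... */
  }

pass_dse::execute would then go back to calling a single entry point for
every statement with a vdef.
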