On Tue, 5 May 2020, Richard Biener wrote:
>
> This rewrites store-motion to process candidates for which we can
> ensure order preservation separately, and with no need to disambiguate
> against all stores.  Those candidates we cannot handle this way
> are validated to be independent of all stores (without TBAA) and then
> processed as "unordered" (all conditionally executed stores are handled
> this way as well).
>
> This will necessarily cause
>   FAIL: gcc.dg/graphite/pr80906.c scan-tree-dump graphite "isl AST to Gimple succeeded"
> because the SM previously performed is not valid for exactly the PR57359
> reason; we still perform SM of qc for the innermost loop, but that is not
> enough.
>
> There is still room for improvement because we still check some constraints
> for the order-preserving cases that are only necessary in the current
> strict way for the unordered ones.  Leaving that for the future.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu; a final
> SPEC 2006 evaluation is running.
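
For context, the invalid reordering PR57359 is about shows up whenever two
accesses with different types really do alias, for example through a union.
The sketch below is essentially the new gcc.dg/torture/pr57359-1.c testcase
from the patch, trimmed and annotated to show just the core of the problem;
it is illustrative only:

  typedef int A;
  typedef float B;

  void __attribute__((noinline,noclone))
  foo (A *p, B *q, long n)
  {
    for (long i = 0; i < n; ++i)
      {
        *p = 1;     /* SM candidate; TBAA says it cannot alias q[i] ...  */
        q[i] = 42;  /* ... but if p and q point into the same union,
                       this store must remain the last one.  */
      }
    /* Keeping *p in a register and emitting the store on the exit edge,
       justified by TBAA alone, makes it clobber the final q[i] store and
       changes the observable result.  */
  }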
A first complete 3-run looked too good, so I repeated the off-noise ones,
which eliminated most of them as noise ... there are two consistent results,
namely a ~1% regression on 401.bzip2 and a ~1% progression on 464.h264ref.
Given earlier, much larger (~5-10%) "consistent" noise on other tests I
declare it a wash ...

That means I'll push the patch to master once 10.1 is released (unless
there are comments on the patch itself).

Thanks,
Richard.

> Thanks,
> Richard.
>
> 2020-05-05  Richard Biener  <rguent...@suse.de>
>
> 	PR tree-optimization/57359
> 	* tree-ssa-loop-im.c (im_mem_ref::indep_loop): Remove.
> 	(im_mem_ref::dep_loop): Repurpose.
> 	(LOOP_DEP_BIT): Remove.
> 	(enum dep_kind): New.
> 	(enum dep_state): Likewise.
> 	(record_loop_dependence): New function to populate the
> 	dependence cache.
> 	(query_loop_dependence): New function to query the dependence
> 	cache.
> 	(memory_accesses::refs_in_loop): Rename to ...
> 	(memory_accesses::refs_loaded_in_loop): ... this and change to
> 	only record loads.
> 	(outermost_indep_loop): Adjust.
> 	(mem_ref_alloc): Likewise.
> 	(gather_mem_refs_stmt): Likewise.
> 	(mem_refs_may_alias_p): Add tbaa_p parameter and pass it down.
> 	(struct sm_aux): New.
> 	(execute_sm): Split code generation on exits, record state
> 	into new hash-map.
> 	(enum sm_kind): New.
> 	(execute_sm_exit): Exit code generation part.
> 	(sm_seq_push_down): Helper for sm_seq_valid_bb performing
> 	dependence checking on stores reached from exits.
> 	(sm_seq_valid_bb): New function gathering SM stores on exits.
> 	(hoist_memory_references): Re-implement.
> 	(refs_independent_p): Add tbaa_p parameter and pass it down.
> 	(record_dep_loop): Remove.
> 	(ref_indep_loop_p_1): Fold into ...
> 	(ref_indep_loop_p): ... this and generalize for three kinds
> 	of dependence queries.
> 	(can_sm_ref_p): Adjust according to hoist_memory_references
> 	changes.
> 	(store_motion_loop): Don't do anything if the set of SM
> 	candidates is empty.
> 	(tree_ssa_lim_initialize): Adjust.
> 	(tree_ssa_lim_finalize): Likewise.
>
> 	* gcc.dg/torture/pr57359-1.c: New testcase.
> 	* gcc.dg/torture/pr57359-2.c: Likewise.
> 	* gcc.dg/tree-ssa/ssa-lim-14.c: Likewise.
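
To make the new dependence cache mentioned in the ChangeLog easier to follow:
each im_mem_ref now gets six bits per loop, two (known independent / known
dependent) for each of the three dep_kind queries, with neither bit set
meaning dep_unknown.  A stand-alone sketch of that encoding follows; the real
code stores the bits in ref->dep_loop via the bitmap API, so the byte array
and helper names here are purely illustrative:

  enum dep_kind { lim_raw, sm_war, sm_waw };
  enum dep_state { dep_unknown, dep_independent, dep_dependent };

  /* First bit of the (loop, kind) pair; it records "independent",
     the following bit records "dependent".  */
  static unsigned
  dep_first_bit (int loop_num, enum dep_kind kind)
  {
    return 6 * loop_num + 2 * kind;
  }

  static void
  record_dep (unsigned char *bits, int loop_num,
	      enum dep_kind kind, enum dep_state state)
  {
    unsigned bit = dep_first_bit (loop_num, kind)
		   + (state == dep_dependent ? 1 : 0);
    bits[bit / 8] |= (unsigned char) (1u << (bit % 8));
  }

  static enum dep_state
  query_dep (const unsigned char *bits, int loop_num, enum dep_kind kind)
  {
    unsigned bit = dep_first_bit (loop_num, kind);
    if (bits[bit / 8] & (1u << (bit % 8)))
      return dep_independent;
    if (bits[(bit + 1) / 8] & (1u << ((bit + 1) % 8)))
      return dep_dependent;
    return dep_unknown;
  }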
> --- > gcc/testsuite/gcc.dg/torture/pr57359-1.c | 23 ++ > gcc/testsuite/gcc.dg/torture/pr57359-2.c | 30 ++ > gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c | 33 ++ > gcc/tree-ssa-loop-im.c | 576 > +++++++++++++++++++++++------ > 4 files changed, 548 insertions(+), 114 deletions(-) > create mode 100644 gcc/testsuite/gcc.dg/torture/pr57359-1.c > create mode 100644 gcc/testsuite/gcc.dg/torture/pr57359-2.c > create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c > > diff --git a/gcc/testsuite/gcc.dg/torture/pr57359-1.c > b/gcc/testsuite/gcc.dg/torture/pr57359-1.c > new file mode 100644 > index 00000000000..f5a406a34d6 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/torture/pr57359-1.c > @@ -0,0 +1,23 @@ > +/* { dg-do run } */ > + > +extern void abort(); > + > +typedef int A; > +typedef float B; > + > +void __attribute__((noinline,noclone)) > +foo(A *p, B *q, long unk) > +{ > + for (long i = 0; i < unk; ++i) { > + *p = 1; > + q[i] = 42; > + } > +} > + > +int main(void) > +{ > + union { A x; B f; } u; > + foo(&u.x, &u.f, 1); > + if (u.f != 42) abort(); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/torture/pr57359-2.c > b/gcc/testsuite/gcc.dg/torture/pr57359-2.c > new file mode 100644 > index 00000000000..ce7d9890af4 > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/torture/pr57359-2.c > @@ -0,0 +1,30 @@ > +/* { dg-do run } */ > +/* { dg-additional-options "-fstrict-aliasing" } */ > + > +extern void abort(); > + > +typedef int A; > +typedef float B; > + > +void __attribute__((noipa)) > +foo(A * p, B *r, long unk, long oh) > +{ > + for (long i = 0; i < unk; ++i) { > + *p = 1; > + *r = 2; > + if (oh & i) > + break; > + *r = 3; > + *p = 4; > + } > +} > + > +int main(void) > +{ > + union { A x; B f; } u; > + foo(&u.x, &u.f, 1, 1); > + if (u.x != 4) abort(); > + foo(&u.x, &u.f, 2, 1); > + if (u.f != 2) abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c > b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c > new file mode 100644 > index 00000000000..3eb5be8b23f > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c > @@ -0,0 +1,33 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -fdump-tree-lim-details" } */ > + > +extern void abort(); > + > +typedef int A; > +typedef float B; > + > +void __attribute__((noinline,noclone)) > +foo(A * p, B *r, long unk, long oh) > +{ > + for (long i = 0; i < unk; ++i) { > + *p = 1; > + *r = 2; > + if (oh & i) > + break; > + *r = 3; > + *p = 4; > + } > +} > + > +int main(void) > +{ > + union { A x; B f; } u; > + foo(&u.x, &u.f, 1, 1); > + if (u.x != 4) abort(); > + foo(&u.x, &u.f, 2, 1); > + if (u.f != 2) abort (); > + return 0; > +} > + > +/* { dg-final { scan-tree-dump "Executing store motion of \\*p" "lim2" } } */ > +/* { dg-final { scan-tree-dump "Executing store motion of \\*r" "lim2" } } */ > diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c > index 18e5c18c17e..724882c3eb6 100644 > --- a/gcc/tree-ssa-loop-im.c > +++ b/gcc/tree-ssa-loop-im.c > @@ -133,24 +133,41 @@ public: > /* The locations of the accesses. Vector > indexed by the loop number. */ > > - /* The following sets are computed on demand. We keep both set and > - its complement, so that we know whether the information was > - already computed or not. */ > - bitmap_head indep_loop; /* The set of loops in that the memory > - reference is independent, meaning: > - If it is stored in the loop, this store > - is independent on all other loads and > - stores. > - If it is only loaded, then it is independent > - on all stores in the loop. 
*/ > - bitmap_head dep_loop; /* The complement of INDEP_LOOP. */ > + /* The following set is computed on demand. */ > + bitmap_head dep_loop; /* The set of loops in that the memory > + reference is {in,}dependent in > + different modes. */ > }; > > -/* We use two bits per loop in the ref->{in,}dep_loop bitmaps, the first > - to record (in)dependence against stores in the loop and its subloops, the > - second to record (in)dependence against all references in the loop > - and its subloops. */ > -#define LOOP_DEP_BIT(loopnum, storedp) (2 * (loopnum) + (storedp ? 1 : 0)) > +/* We use six bits per loop in the ref->dep_loop bitmap to record > + the dep_kind x dep_state combinations. */ > + > +enum dep_kind { lim_raw, sm_war, sm_waw }; > +enum dep_state { dep_unknown, dep_independent, dep_dependent }; > + > +/* Populate the loop dependence cache of REF for LOOP, KIND with STATE. */ > + > +static void > +record_loop_dependence (class loop *loop, im_mem_ref *ref, > + dep_kind kind, dep_state state) > +{ > + gcc_assert (state != dep_unknown); > + unsigned bit = 6 * loop->num + kind * 2 + state == dep_dependent ? 1 : 0; > + bitmap_set_bit (&ref->dep_loop, bit); > +} > + > +/* Query the loop dependence cache of REF for LOOP, KIND. */ > + > +static dep_state > +query_loop_dependence (class loop *loop, im_mem_ref *ref, dep_kind kind) > +{ > + unsigned first_bit = 6 * loop->num + kind * 2; > + if (bitmap_bit_p (&ref->dep_loop, first_bit)) > + return dep_independent; > + else if (bitmap_bit_p (&ref->dep_loop, first_bit + 1)) > + return dep_dependent; > + return dep_unknown; > +} > > /* Mem_ref hashtable helpers. */ > > @@ -211,7 +228,7 @@ static struct > vec<im_mem_ref *> refs_list; > > /* The set of memory references accessed in each loop. */ > - vec<bitmap_head> refs_in_loop; > + vec<bitmap_head> refs_loaded_in_loop; > > /* The set of memory references stored in each loop. */ > vec<bitmap_head> refs_stored_in_loop; > @@ -227,8 +244,9 @@ static struct > static bitmap_obstack lim_bitmap_obstack; > static obstack mem_ref_obstack; > > -static bool ref_indep_loop_p (class loop *, im_mem_ref *); > +static bool ref_indep_loop_p (class loop *, im_mem_ref *, dep_kind); > static bool ref_always_accessed_p (class loop *, im_mem_ref *, bool); > +static bool refs_independent_p (im_mem_ref *, im_mem_ref *, bool = true); > > /* Minimum cost of an expensive expression. 
*/ > #define LIM_EXPENSIVE ((unsigned) param_lim_expensive) > @@ -573,10 +591,10 @@ outermost_indep_loop (class loop *outer, class loop > *loop, im_mem_ref *ref) > aloop != loop; > aloop = superloop_at_depth (loop, loop_depth (aloop) + 1)) > if ((!ref->stored || !bitmap_bit_p (ref->stored, aloop->num)) > - && ref_indep_loop_p (aloop, ref)) > + && ref_indep_loop_p (aloop, ref, lim_raw)) > return aloop; > > - if (ref_indep_loop_p (loop, ref)) > + if (ref_indep_loop_p (loop, ref, lim_raw)) > return loop; > else > return NULL; > @@ -1398,7 +1416,6 @@ mem_ref_alloc (ao_ref *mem, unsigned hash, unsigned id) > ref->hash = hash; > ref->stored = NULL; > ref->loaded = NULL; > - bitmap_initialize (&ref->indep_loop, &lim_bitmap_obstack); > bitmap_initialize (&ref->dep_loop, &lim_bitmap_obstack); > ref->accesses_in_loop.create (1); > > @@ -1589,14 +1606,17 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt) > > record_mem_ref_loc (ref, stmt, mem); > } > - bitmap_set_bit (&memory_accesses.refs_in_loop[loop->num], ref->id); > if (is_stored) > { > bitmap_set_bit (&memory_accesses.refs_stored_in_loop[loop->num], > ref->id); > mark_ref_stored (ref, loop); > } > - else > - mark_ref_loaded (ref, loop); > + /* A not simple memory op is also a read when it is a write. */ > + if (!is_stored || id == UNANALYZABLE_MEM_ID) > + { > + bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], > ref->id); > + mark_ref_loaded (ref, loop); > + } > init_lim_data (stmt)->ref = ref->id; > return; > } > @@ -1698,7 +1718,8 @@ analyze_memory_references (void) > > static bool > mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2, > - hash_map<tree, name_expansion *> **ttae_cache) > + hash_map<tree, name_expansion *> **ttae_cache, > + bool tbaa_p) > { > /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same > object and their offset differ in such a way that the locations cannot > @@ -1707,7 +1728,7 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref > *mem2, > aff_tree off1, off2; > > /* Perform basic offset and type-based disambiguation. */ > - if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, true)) > + if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, tbaa_p)) > return false; > > /* The expansion of addresses may be a bit expensive, thus we only do > @@ -2096,23 +2117,28 @@ execute_sm_if_changed_flag_set (class loop *loop, > im_mem_ref *ref, > return flag; > } > > +struct sm_aux > +{ > + tree tmp_var; > + tree store_flag; > + hash_set <basic_block> flag_bbs; > +}; > + > /* Executes store motion of memory reference REF from LOOP. > Exits from the LOOP are stored in EXITS. The initialization of the > temporary variable is put to the preheader of the loop, and assignments > to the reference from the temporary variable are emitted to exits. 
*/ > > static void > -execute_sm (class loop *loop, vec<edge> exits, im_mem_ref *ref) > +execute_sm (class loop *loop, im_mem_ref *ref, > + hash_map<im_mem_ref *, sm_aux *> &aux_map) > { > - tree tmp_var, store_flag = NULL_TREE; > - unsigned i; > gassign *load; > struct fmt_data fmt_data; > - edge ex; > struct lim_aux_data *lim_data; > bool multi_threaded_model_p = false; > gimple_stmt_iterator gsi; > - hash_set<basic_block> flag_bbs; > + sm_aux *aux = new sm_aux; > > if (dump_file && (dump_flags & TDF_DETAILS)) > { > @@ -2121,8 +2147,8 @@ execute_sm (class loop *loop, vec<edge> exits, > im_mem_ref *ref) > fprintf (dump_file, " from loop %d\n", loop->num); > } > > - tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), > - get_lsm_tmp_name (ref->mem.ref, ~0)); > + aux->tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), > + get_lsm_tmp_name (ref->mem.ref, ~0)); > > fmt_data.loop = loop; > fmt_data.orig_loop = loop; > @@ -2134,9 +2160,15 @@ execute_sm (class loop *loop, vec<edge> exits, > im_mem_ref *ref) > multi_threaded_model_p = true; > > if (multi_threaded_model_p) > - store_flag = execute_sm_if_changed_flag_set (loop, ref, &flag_bbs); > + aux->store_flag > + = execute_sm_if_changed_flag_set (loop, ref, &aux->flag_bbs); > + else > + aux->store_flag = NULL_TREE; > + > + /* Remember variable setup. */ > + aux_map.put (ref, aux); > > - rewrite_mem_refs (loop, ref, tmp_var); > + rewrite_mem_refs (loop, ref, aux->tmp_var); > > /* Emit the load code on a random exit edge or into the latch if > the loop does not exit, so that we are sure it will be processed > @@ -2148,7 +2180,7 @@ execute_sm (class loop *loop, vec<edge> exits, > im_mem_ref *ref) > away later. */ > if (ref->loaded && bitmap_bit_p (ref->loaded, loop->num)) > { > - load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); > + load = gimple_build_assign (aux->tmp_var, unshare_expr (ref->mem.ref)); > lim_data = init_lim_data (load); > lim_data->max_loop = loop; > lim_data->tgt_loop = loop; > @@ -2157,24 +2189,269 @@ execute_sm (class loop *loop, vec<edge> exits, > im_mem_ref *ref) > > if (multi_threaded_model_p) > { > - load = gimple_build_assign (store_flag, boolean_false_node); > + load = gimple_build_assign (aux->store_flag, boolean_false_node); > lim_data = init_lim_data (load); > lim_data->max_loop = loop; > lim_data->tgt_loop = loop; > gsi_insert_before (&gsi, load, GSI_SAME_STMT); > } > +} > > - /* Sink the store to every exit from the loop. */ > - FOR_EACH_VEC_ELT (exits, i, ex) > - if (!multi_threaded_model_p) > +/* sm_ord is used for ordinary stores we can retain order with respect > + to other stores > + sm_unord is used for conditional executed stores which need to be > + able to execute in arbitrary order with respect to other stores > + sm_other is used for stores we do not try to apply store motion to. */ > +enum sm_kind { sm_ord, sm_unord, sm_other }; > +typedef std::pair<unsigned, sm_kind> seq_entry; > + > +static void > +execute_sm_exit (class loop *loop, edge ex, vec<seq_entry> &seq, > + hash_map<im_mem_ref *, sm_aux *> &aux_map, sm_kind kind) > +{ > + /* Sink the stores to exit from the loop. 
*/ > + for (unsigned i = seq.length (); i > 0; --i) > + { > + if (seq[i-1].second != kind) > + continue; > + im_mem_ref *ref = memory_accesses.refs_list[seq[i-1].first]; > + sm_aux *aux = *aux_map.get (ref); > + if (!aux->store_flag) > + { > + gassign *store; > + store = gimple_build_assign (unshare_expr (ref->mem.ref), > + aux->tmp_var); > + gsi_insert_on_edge (ex, store); > + } > + else > + execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var, aux->store_flag, > + loop_preheader_edge (loop), &aux->flag_bbs); > + } > +} > + > +/* Push the SM candidate at index PTR in the sequence SEQ down until > + we hit the next SM candidate. Return true if that went OK and > + false if we could not disambiguate agains another unrelated ref. */ > + > +static bool > +sm_seq_push_down (vec<seq_entry> &seq, unsigned ptr) > +{ > + for (; ptr > 0; --ptr) > + { > + seq_entry &new_cand = seq[ptr]; > + seq_entry &against = seq[ptr-1]; > + if (against.second == sm_ord) > + /* Found the tail of the sequence. */ > + break; > + if (!refs_independent_p (memory_accesses.refs_list[new_cand.first], > + memory_accesses.refs_list[against.first], > + false)) > + /* ??? Prune new_cand from the list of refs to apply SM to. */ > + return false; > + std::swap (new_cand, against); > + } > + return true; > +} > + > +/* Computes the sequence of stores from candidates in REFS_NOT_IN_SEQ to SEQ > + walking backwards from VDEF (or the end of BB if VDEF is NULL). */ > + > +static int > +sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, > + vec<seq_entry> &seq, bitmap refs_not_in_seq, > + bitmap refs_not_supported, bool forked) > +{ > + if (!vdef) > + for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi); > + gsi_prev (&gsi)) > { > - gassign *store; > - store = gimple_build_assign (unshare_expr (ref->mem.ref), tmp_var); > - gsi_insert_on_edge (ex, store); > + vdef = gimple_vdef (gsi_stmt (gsi)); > + if (vdef) > + break; > } > - else > - execute_sm_if_changed (ex, ref->mem.ref, tmp_var, store_flag, > - loop_preheader_edge (loop), &flag_bbs); > + if (!vdef) > + { > + gphi *vphi = get_virtual_phi (bb); > + if (vphi) > + vdef = gimple_phi_result (vphi); > + } > + if (!vdef) > + { > + if (single_pred_p (bb)) > + /* This handles the perfect nest case. */ > + return sm_seq_valid_bb (loop, single_pred (bb), vdef, > + seq, refs_not_in_seq, refs_not_supported, > + forked); > + return 0; > + } > + do > + { > + gimple *def = SSA_NAME_DEF_STMT (vdef); > + if (gimple_bb (def) != bb) > + { > + /* If we forked by processing a PHI do not allow our walk to > + merge again until we handle that robustly. */ > + if (forked) > + { > + /* Mark refs_not_in_seq as unsupported. */ > + bitmap_ior_into (refs_not_supported, refs_not_in_seq); > + return 1; > + } > + /* Otherwise it doesn't really matter if we end up in different > + BBs. */ > + bb = gimple_bb (def); > + } > + if (gphi *phi = dyn_cast <gphi *> (def)) > + { > + /* Handle CFG merges. Until we handle forks (gimple_bb (def) != bb) > + this is still linear. > + Eventually we want to cache intermediate results per BB > + (but we can't easily cache for different exits?). */ > + /* Stop at PHIs with possible backedges. */ > + if (bb == bb->loop_father->header > + || bb->flags & BB_IRREDUCIBLE_LOOP) > + { > + /* Mark refs_not_in_seq as unsupported. 
*/ > + bitmap_ior_into (refs_not_supported, refs_not_in_seq); > + return 1; > + } > + if (gimple_phi_num_args (phi) == 1) > + return sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, > + gimple_phi_arg_def (phi, 0), seq, > + refs_not_in_seq, refs_not_supported, > + false); > + auto_vec<seq_entry> first_edge_seq; > + auto_bitmap tem_refs_not_in_seq (&lim_bitmap_obstack); > + int eret; > + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); > + eret = sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, > + gimple_phi_arg_def (phi, 0), > + first_edge_seq, > + tem_refs_not_in_seq, refs_not_supported, > + true); > + if (eret != 1) > + return -1; > + /* Simplify our lives by pruning the sequence of !sm_ord. */ > + while (!first_edge_seq.is_empty () > + && first_edge_seq.last ().second != sm_ord) > + first_edge_seq.pop (); > + for (unsigned int i = 1; i < gimple_phi_num_args (phi); ++i) > + { > + tree vuse = gimple_phi_arg_def (phi, i); > + edge e = gimple_phi_arg_edge (phi, i); > + auto_vec<seq_entry> edge_seq; > + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); > + eret = sm_seq_valid_bb (loop, e->src, vuse, edge_seq, > + tem_refs_not_in_seq, refs_not_supported, > + true); > + if (eret != 1) > + return -1; > + /* Simplify our lives by pruning the sequence of !sm_ord. */ > + while (!edge_seq.is_empty () > + && edge_seq.last ().second != sm_ord) > + edge_seq.pop (); > + unsigned min_len = MIN(first_edge_seq.length (), > + edge_seq.length ()); > + /* Incrementally merge seqs into first_edge_seq. */ > + for (unsigned int i = 0; i < min_len; ++i) > + { > + /* ??? We can more intelligently merge when we face different > + order by additional sinking operations in one sequence. > + For now we simply mark them as to be processed by the > + not order-preserving SM code. */ > + if (first_edge_seq[i].first != edge_seq[i].first) > + { > + bitmap_set_bit (refs_not_supported, > + first_edge_seq[i].first); > + bitmap_set_bit (refs_not_supported, edge_seq[i].first); > + first_edge_seq[i].second = sm_unord; > + } > + /* sm_unord prevails. */ > + else if (first_edge_seq[i].second != edge_seq[i].second) > + { > + /* This is just an optimization. */ > + gcc_assert (bitmap_bit_p (refs_not_supported, > + first_edge_seq[i].first)); > + first_edge_seq[i].second = sm_unord; > + } > + } > + /* Any excess elements become sm_unord since they are now > + coonditionally executed. */ > + if (first_edge_seq.length () > edge_seq.length ()) > + { > + for (unsigned i = edge_seq.length (); > + i < first_edge_seq.length (); ++i) > + { > + bitmap_set_bit (refs_not_supported, > + first_edge_seq[i].first); > + first_edge_seq[i].second = sm_unord; > + } > + } > + else if (edge_seq.length () > first_edge_seq.length ()) > + { > + for (unsigned i = first_edge_seq.length (); > + i < edge_seq.length (); ++i) > + bitmap_set_bit (refs_not_supported, edge_seq[i].first); > + } > + } > + /* Use the sequence from the first edge and push SMs down. */ > + for (unsigned i = 0; i < first_edge_seq.length (); ++i) > + { > + if (first_edge_seq[i].second == sm_other) > + break; > + unsigned id = first_edge_seq[i].first; > + seq.safe_push (first_edge_seq[i]); > + if (first_edge_seq[i].second == sm_ord > + && !sm_seq_push_down (seq, seq.length () - 1)) > + { > + bitmap_set_bit (refs_not_supported, id); > + /* ??? Mark it sm_unord but it's now "somewhere" ... 
*/ > + for (unsigned i = seq.length (); i != 0; --i) > + if (seq[i - 1].first == id) > + { > + seq[i - 1].second = sm_unord; > + break; > + } > + } > + } > + return 1; > + } > + lim_aux_data *data = get_lim_data (def); > + gcc_assert (data); > + if (data->ref == UNANALYZABLE_MEM_ID) > + return -1; > + /* One of the stores we want to apply SM to and we've not yet seen. */ > + else if (bitmap_clear_bit (refs_not_in_seq, data->ref)) > + { > + seq.safe_push (std::make_pair (data->ref, sm_ord)); > + > + /* 1) push it down the queue until a SMed > + and not ignored ref is reached, skipping all not SMed refs > + and ignored refs via non-TBAA disambiguation. */ > + if (!sm_seq_push_down (seq, seq.length () - 1)) > + { > + bitmap_set_bit (refs_not_supported, data->ref); > + /* ??? Mark it sm_unord but it's now "somewhere" ... */ > + for (unsigned i = seq.length (); i != 0; --i) > + if (seq[i - 1].first == data->ref) > + { > + seq[i - 1].second = sm_unord; > + break; > + } > + } > + > + /* 2) check whether we've seen all refs we want to SM and if so > + declare success for the active exit */ > + if (bitmap_empty_p (refs_not_in_seq)) > + return 1; > + } > + else > + /* Another store not part of the final sequence. Simply push it. */ > + seq.safe_push (std::make_pair (data->ref, sm_other)); > + > + vdef = gimple_vuse (def); > + } > + while (1); > } > > /* Hoists memory references MEM_REFS out of LOOP. EXITS is the list of exit > @@ -2188,11 +2465,104 @@ hoist_memory_references (class loop *loop, bitmap > mem_refs, > unsigned i; > bitmap_iterator bi; > > + /* To address PR57359 before actually applying store-motion check > + the candidates found for validity with regards to reordering > + relative to other stores which we until here disambiguated using > + TBAA which isn't valid. > + What matters is the order of the last stores to the mem_refs > + with respect to the other stores of the loop at the point of the > + loop exits. */ > + > + /* For each exit compute the store order, pruning from mem_refs > + on the fly. */ > + /* The complexity of this is at least > + O(number of exits * number of SM refs) but more approaching > + O(number of exits * number of SM refs * number of stores). */ > + /* ??? Somehow do this in a single sweep over the loop body. */ > + auto_vec<std::pair<edge, vec<seq_entry> > > sms; > + auto_bitmap refs_not_supported (&lim_bitmap_obstack); > + edge e; > + FOR_EACH_VEC_ELT (exits, i, e) > + { > + vec<seq_entry> seq; > + seq.create (4); > + auto_bitmap refs_not_in_seq (&lim_bitmap_obstack); > + bitmap_copy (refs_not_in_seq, mem_refs); > + int res = sm_seq_valid_bb (loop, e->src, NULL_TREE, > + seq, refs_not_in_seq, > + refs_not_supported, false); > + if (res != 1) > + { > + bitmap_copy (refs_not_supported, mem_refs); > + break; > + } > + sms.safe_push (std::make_pair (e, seq)); > + } > + > + /* Prune pruned mem_refs from earlier processed exits. 
*/ > + bool changed = !bitmap_empty_p (refs_not_supported); > + while (changed) > + { > + changed = false; > + std::pair<edge, vec<seq_entry> > *seq; > + FOR_EACH_VEC_ELT (sms, i, seq) > + { > + for (unsigned i = 0; i < seq->second.length (); ++i) > + { > + if (seq->second[i].second == sm_other) > + break; > + unsigned id = seq->second[i].first; > + if (bitmap_bit_p (refs_not_supported, id)) > + seq->second[i].second = sm_other; > + else if (!sm_seq_push_down (seq->second, i)) > + { > + if (bitmap_set_bit (refs_not_supported, id)) > + changed = true; > + } > + } > + } > + } > + > + /* Verify dependence for refs we cannot handle with the order preserving > + code (refs_not_supported) or prune them from mem_refs. */ > + auto_vec<seq_entry> unord_refs; > + EXECUTE_IF_SET_IN_BITMAP (refs_not_supported, 0, i, bi) > + { > + ref = memory_accesses.refs_list[i]; > + if (!ref_indep_loop_p (loop, ref, sm_waw)) > + bitmap_clear_bit (mem_refs, i); > + /* We've now verified store order for ref with respect to all other > + stores in the loop does not matter. */ > + else > + unord_refs.safe_push (std::make_pair (i, sm_unord)); > + } > + > + hash_map<im_mem_ref *, sm_aux *> aux_map; > + > + /* Execute SM but delay the store materialization for ordered > + sequences on exit. */ > EXECUTE_IF_SET_IN_BITMAP (mem_refs, 0, i, bi) > { > ref = memory_accesses.refs_list[i]; > - execute_sm (loop, exits, ref); > + execute_sm (loop, ref, aux_map); > } > + > + /* Materialize ordered store sequences on exits. */ > + FOR_EACH_VEC_ELT (exits, i, e) > + { > + if (i < sms.length ()) > + { > + gcc_assert (sms[i].first == e); > + execute_sm_exit (loop, e, sms[i].second, aux_map, sm_ord); > + sms[i].second.release (); > + } > + if (!unord_refs.is_empty ()) > + execute_sm_exit (loop, e, unord_refs, aux_map, sm_unord); > + } > + > + for (hash_map<im_mem_ref *, sm_aux *>::iterator iter = aux_map.begin (); > + iter != aux_map.end (); ++iter) > + delete (*iter).second; > } > > class ref_always_accessed > @@ -2248,7 +2618,7 @@ ref_always_accessed_p (class loop *loop, im_mem_ref > *ref, bool stored_p) > /* Returns true if REF1 and REF2 are independent. */ > > static bool > -refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) > +refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2, bool tbaa_p) > { > if (ref1 == ref2) > return true; > @@ -2257,7 +2627,7 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) > fprintf (dump_file, "Querying dependency of refs %u and %u: ", > ref1->id, ref2->id); > > - if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache)) > + if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache, tbaa_p)) > { > if (dump_file && (dump_flags & TDF_DETAILS)) > fprintf (dump_file, "dependent.\n"); > @@ -2271,32 +2641,23 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref > *ref2) > } > } > > -/* Mark REF dependent on stores or loads (according to STORED_P) in LOOP > - and its super-loops. */ > - > -static void > -record_dep_loop (class loop *loop, im_mem_ref *ref, bool stored_p) > -{ > - /* We can propagate dependent-in-loop bits up the loop > - hierarchy to all outer loops. */ > - while (loop != current_loops->tree_root > - && bitmap_set_bit (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) > - loop = loop_outer (loop); > -} > - > -/* Returns true if REF is independent on all other memory > - references in LOOP. */ > +/* Returns true if REF is independent on all other accessess in LOOP. > + KIND specifies the kind of dependence to consider. 
> + lim_raw assumes REF is not stored in LOOP and disambiguates RAW > + dependences so if true REF can be hoisted out of LOOP > + sm_war disambiguates a store REF against all other loads to see > + whether the store can be sunk across loads out of LOOP > + sm_waw disambiguates a store REF against all other stores to see > + whether the store can be sunk across stores out of LOOP. */ > > static bool > -ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) > +ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind) > { > - stored_p |= (ref->stored && bitmap_bit_p (ref->stored, loop->num)); > - > bool indep_p = true; > bitmap refs_to_check; > > - if (stored_p) > - refs_to_check = &memory_accesses.refs_in_loop[loop->num]; > + if (kind == sm_war) > + refs_to_check = &memory_accesses.refs_loaded_in_loop[loop->num]; > else > refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num]; > > @@ -2304,15 +2665,15 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref > *ref, bool stored_p) > indep_p = false; > else > { > - if (bitmap_bit_p (&ref->indep_loop, LOOP_DEP_BIT (loop->num, > stored_p))) > - return true; > - if (bitmap_bit_p (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) > - return false; > + /* tri-state, { unknown, independent, dependent } */ > + dep_state state = query_loop_dependence (loop, ref, kind); > + if (state != dep_unknown) > + return state == dep_independent ? true : false; > > class loop *inner = loop->inner; > while (inner) > { > - if (!ref_indep_loop_p_1 (inner, ref, stored_p)) > + if (!ref_indep_loop_p (inner, ref, kind)) > { > indep_p = false; > break; > @@ -2327,7 +2688,7 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, > bool stored_p) > EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi) > { > im_mem_ref *aref = memory_accesses.refs_list[i]; > - if (!refs_independent_p (ref, aref)) > + if (!refs_independent_p (ref, aref, kind != sm_waw)) > { > indep_p = false; > break; > @@ -2337,44 +2698,17 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref > *ref, bool stored_p) > } > > if (dump_file && (dump_flags & TDF_DETAILS)) > - fprintf (dump_file, "Querying dependencies of ref %u in loop %d: %s\n", > + fprintf (dump_file, "Querying %s dependencies of ref %u in loop %d: > %s\n", > + kind == lim_raw ? "RAW" : (kind == sm_war ? "SM WAR" : "SM WAW"), > ref->id, loop->num, indep_p ? "independent" : "dependent"); > > /* Record the computed result in the cache. */ > - if (indep_p) > - { > - if (bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, > stored_p)) > - && stored_p) > - { > - /* If it's independend against all refs then it's independent > - against stores, too. */ > - bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, false)); > - } > - } > - else > - { > - record_dep_loop (loop, ref, stored_p); > - if (!stored_p) > - { > - /* If it's dependent against stores it's dependent against > - all refs, too. */ > - record_dep_loop (loop, ref, true); > - } > - } > + record_loop_dependence (loop, ref, kind, > + indep_p ? dep_independent : dep_dependent); > > return indep_p; > } > > -/* Returns true if REF is independent on all other memory references in > - LOOP. */ > - > -static bool > -ref_indep_loop_p (class loop *loop, im_mem_ref *ref) > -{ > - gcc_checking_assert (MEM_ANALYZABLE (ref)); > - > - return ref_indep_loop_p_1 (loop, ref, false); > -} > > /* Returns true if we can perform store motion of REF from LOOP. 
*/ > > @@ -2404,12 +2738,25 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) > base = get_base_address (ref->mem.ref); > if ((tree_could_trap_p (ref->mem.ref) > || (DECL_P (base) && TREE_READONLY (base))) > + /* ??? We can at least use false here, allowing loads? We > + are forcing conditional stores if the ref is not always > + stored to later anyway. So this would only guard > + the load we need to emit. Thus when the ref is not > + loaded we can elide this completely? */ > && !ref_always_accessed_p (loop, ref, true)) > return false; > > - /* And it must be independent on all other memory references > - in LOOP. */ > - if (!ref_indep_loop_p (loop, ref)) > + /* Verify all loads of ref can be hoisted. */ > + if (ref->loaded > + && bitmap_bit_p (ref->loaded, loop->num) > + && !ref_indep_loop_p (loop, ref, lim_raw)) > + return false; > + > + /* Verify the candidate can be disambiguated against all loads, > + that is, we can elide all in-loop stores. Disambiguation > + against stores is done later when we cannot guarantee preserving > + the order of stores. */ > + if (!ref_indep_loop_p (loop, ref, sm_war)) > return false; > > return true; > @@ -2467,7 +2814,8 @@ store_motion_loop (class loop *loop, bitmap sm_executed) > if (loop_suitable_for_sm (loop, exits)) > { > find_refs_for_sm (loop, sm_executed, sm_in_loop); > - hoist_memory_references (loop, sm_in_loop, exits); > + if (!bitmap_empty_p (sm_in_loop)) > + hoist_memory_references (loop, sm_in_loop, exits); > } > exits.release (); > > @@ -2623,8 +2971,8 @@ tree_ssa_lim_initialize (void) > memory_accesses.refs_list.quick_push > (mem_ref_alloc (NULL, 0, UNANALYZABLE_MEM_ID)); > > - memory_accesses.refs_in_loop.create (number_of_loops (cfun)); > - memory_accesses.refs_in_loop.quick_grow (number_of_loops (cfun)); > + memory_accesses.refs_loaded_in_loop.create (number_of_loops (cfun)); > + memory_accesses.refs_loaded_in_loop.quick_grow (number_of_loops (cfun)); > memory_accesses.refs_stored_in_loop.create (number_of_loops (cfun)); > memory_accesses.refs_stored_in_loop.quick_grow (number_of_loops (cfun)); > memory_accesses.all_refs_stored_in_loop.create (number_of_loops (cfun)); > @@ -2632,7 +2980,7 @@ tree_ssa_lim_initialize (void) > > for (i = 0; i < number_of_loops (cfun); i++) > { > - bitmap_initialize (&memory_accesses.refs_in_loop[i], > + bitmap_initialize (&memory_accesses.refs_loaded_in_loop[i], > &lim_bitmap_obstack); > bitmap_initialize (&memory_accesses.refs_stored_in_loop[i], > &lim_bitmap_obstack); > @@ -2675,7 +3023,7 @@ tree_ssa_lim_finalize (void) > memory_accesses.refs_list.release (); > obstack_free (&mem_ref_obstack, NULL); > > - memory_accesses.refs_in_loop.release (); > + memory_accesses.refs_loaded_in_loop.release (); > memory_accesses.refs_stored_in_loop.release (); > memory_accesses.all_refs_stored_in_loop.release (); > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg, Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)