This improves LIM (loop-invariant motion) by recording aggregate copies
for disambiguation purposes instead of as UNANALYZABLE_MEM, which would
prevent any invariant or store motion across them.  This allows four of
the six references in the loop of the testcase to be promoted.

Bootstrapped and tested on x86_64-unknown-linux-gnu, pushed.

2021-07-07  Richard Biener  <rguent...@suse.de>

        PR tree-optimization/99728
        * tree-ssa-loop-im.c (gather_mem_refs_stmt): Record
        aggregate copies.
        (mem_refs_may_alias_p): Add assert we handled aggregate
        copies elsewhere.
        (sm_seq_valid_bb): Give up when running into aggregate copies.
        (ref_indep_loop_p): Handle aggregate copies as never
        being invariant themselves but allow other refs to be
        disambiguated against them.
        (can_sm_ref_p): Do not try to apply store-motion to aggregate
        copies.

        * g++.dg/opt/pr99728.C: New testcase.
---
 gcc/testsuite/g++.dg/opt/pr99728.C | 50 +++++++++++++++++++++++++
 gcc/tree-ssa-loop-im.c             | 59 ++++++++++++++++++++++++++----
 2 files changed, 102 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/g++.dg/opt/pr99728.C

diff --git a/gcc/testsuite/g++.dg/opt/pr99728.C 
b/gcc/testsuite/g++.dg/opt/pr99728.C
new file mode 100644
index 00000000000..d4393231b4c
--- /dev/null
+++ b/gcc/testsuite/g++.dg/opt/pr99728.C
@@ -0,0 +1,50 @@
+// PR/99728
+// { dg-do compile }
+// { dg-options "-O2 -fdump-tree-lim2-details -w -Wno-psabi" }
+
+typedef double __m256d __attribute__((vector_size(sizeof (double) * 4)));
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm256_set1_pd (double __A)
+{
+  return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+// simple OO wrapper around __m256d
+struct Tvsimple
+  {
+  __m256d v;
+  Tvsimple &operator+=(const Tvsimple &other) {v+=other.v; return *this;}
+  Tvsimple operator*(double val) const { Tvsimple res; res.v = 
v*_mm256_set1_pd(val); return res;}
+  Tvsimple operator*(Tvsimple val) const { Tvsimple res; res.v = v*val.v; 
return res; }
+  Tvsimple operator+(Tvsimple val) const { Tvsimple res; res.v = v+val.v; 
return res; }
+  Tvsimple operator+(double val) const { Tvsimple res; res.v = 
v+_mm256_set1_pd(val); return res;}
+  };
+
+template<typename vtype> struct s0data_s
+  { vtype sth, corfac, scale, lam1, lam2, csq, p1r, p1i, p2r, p2i; };
+
+template<typename vtype> void foo(s0data_s<vtype> & __restrict__ d,
+  const double * __restrict__ coef, const double * __restrict__ alm,
+  unsigned long l, unsigned long il, unsigned long lmax)
+  {
+// critical loop
+  while (l<=lmax)
+    {
+    d.p1r += d.lam2*alm[2*l];
+    d.p1i += d.lam2*alm[2*l+1];
+    d.p2r += d.lam2*alm[2*l+2];
+    d.p2i += d.lam2*alm[2*l+3];
+    Tvsimple tmp = d.lam2*(d.csq*coef[2*il] + coef[2*il+1]) + d.lam1;
+    d.lam1 = d.lam2;
+    d.lam2 = tmp;
+    ++il; l+=2;
+    }
+  }
+
+// this version has dead stores at the end of the loop
+template void foo<>(s0data_s<Tvsimple> & __restrict__ d,
+  const double * __restrict__ coef, const double * __restrict__ alm,
+  unsigned long l, unsigned long il, unsigned long lmax);
+
+// The aggregate copy in the IL should not prevent all store-motion
+// { dg-final { scan-tree-dump-times "Executing store motion" 4 "lim2" } }
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 9ac390b9a4b..81b4ec21d6e 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -122,7 +122,9 @@ public:
   hashval_t hash;              /* Its hash value.  */
 
   /* The memory access itself and associated caching of alias-oracle
-     query meta-data.  */
+     query meta-data.  We are using mem.ref == error_mark_node for the
+     case the reference is represented by its single access stmt
+     in accesses_in_loop[0].  */
   ao_ref mem;
 
   bitmap stored;               /* The set of loops in that this memory location
@@ -130,8 +132,7 @@ public:
   bitmap loaded;               /* The set of loops in that this memory location
                                   is loaded from.  */
   vec<mem_ref_loc>             accesses_in_loop;
-                               /* The locations of the accesses.  Vector
-                                  indexed by the loop number.  */
+                               /* The locations of the accesses.  */
 
   /* The following set is computed on demand.  */
   bitmap_head dep_loop;                /* The set of loops in that the memory
@@ -1465,7 +1466,22 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
     return;
 
   mem = simple_mem_ref_in_stmt (stmt, &is_stored);
-  if (!mem)
+  if (!mem && is_gimple_assign (stmt))
+    {
+      /* For aggregate copies record distinct references but use them
+        only for disambiguation purposes.  */
+      id = memory_accesses.refs_list.length ();
+      ref = mem_ref_alloc (NULL, 0, id);
+      memory_accesses.refs_list.safe_push (ref);
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       {
+         fprintf (dump_file, "Unhandled memory reference %u: ", id);
+         print_gimple_stmt (dump_file, stmt, 0, TDF_SLIM);
+       }
+      record_mem_ref_loc (ref, stmt, mem);
+      is_stored = gimple_vdef (stmt);
+    }
+  else if (!mem)
     {
       /* We use the shared mem_ref for all unanalyzable refs.  */
       id = UNANALYZABLE_MEM_ID;
@@ -1595,7 +1611,8 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt)
       mark_ref_stored (ref, loop);
     }
   /* A not simple memory op is also a read when it is a write.  */
-  if (!is_stored || id == UNANALYZABLE_MEM_ID)
+  if (!is_stored || id == UNANALYZABLE_MEM_ID
+      || ref->mem.ref == error_mark_node)
     {
       bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], 
ref->id);
       mark_ref_loaded (ref, loop);
@@ -1714,6 +1731,9 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2,
                      hash_map<tree, name_expansion *> **ttae_cache,
                      bool tbaa_p)
 {
+  gcc_checking_assert (mem1->mem.ref != error_mark_node
+                      && mem2->mem.ref != error_mark_node);
+
   /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same
      object and their offset differ in such a way that the locations cannot
      overlap, then they cannot alias.  */
@@ -2490,6 +2510,13 @@ sm_seq_valid_bb (class loop *loop, basic_block bb, tree 
vdef,
       gcc_assert (data);
       if (data->ref == UNANALYZABLE_MEM_ID)
        return -1;
+      /* Stop at memory references which we can't move.  */
+      else if (memory_accesses.refs_list[data->ref]->mem.ref == 
error_mark_node)
+       {
+         /* Mark refs_not_in_seq as unsupported.  */
+         bitmap_ior_into (refs_not_supported, refs_not_in_seq);
+         return 1;
+       }
       /* One of the stores we want to apply SM to and we've not yet seen.  */
       else if (bitmap_clear_bit (refs_not_in_seq, data->ref))
        {
@@ -2798,7 +2825,8 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, 
dep_kind kind)
   else
     refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num];
 
-  if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID))
+  if (bitmap_bit_p (refs_to_check, UNANALYZABLE_MEM_ID)
+      || ref->mem.ref == error_mark_node)
     indep_p = false;
   else
     {
@@ -2825,7 +2853,20 @@ ref_indep_loop_p (class loop *loop, im_mem_ref *ref, 
dep_kind kind)
          EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi)
            {
              im_mem_ref *aref = memory_accesses.refs_list[i];
-             if (!refs_independent_p (ref, aref, kind != sm_waw))
+             if (aref->mem.ref == error_mark_node)
+               {
+                 gimple *stmt = aref->accesses_in_loop[0].stmt;
+                 if ((kind == sm_war
+                      && ref_maybe_used_by_stmt_p (stmt, &ref->mem,
+                                                   kind != sm_waw))
+                     || stmt_may_clobber_ref_p_1 (stmt, &ref->mem,
+                                                  kind != sm_waw))
+                   {
+                     indep_p = false;
+                     break;
+                   }
+               }
+             else if (!refs_independent_p (ref, aref, kind != sm_waw))
                {
                  indep_p = false;
                  break;
@@ -2858,6 +2899,10 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref)
   if (!MEM_ANALYZABLE (ref))
     return false;
 
+  /* Can't hoist/sink aggregate copies.  */
+  if (ref->mem.ref == error_mark_node)
+    return false;
+
   /* It should be movable.  */
   if (!is_gimple_reg_type (TREE_TYPE (ref->mem.ref))
       || TREE_THIS_VOLATILE (ref->mem.ref)
-- 
2.26.2

Reply via email to