Hi,
As analyzed in PR68303 and PR69710, vectorizer generates duplicated 
computations in loop's pre-header basic block when creating base address for 
vector reference to the same memory object.  Because the duplicated code is out 
of loop, IVOPT fails to track base object for these vector references, 
resulting in missed strength reduction.
It's agreed that vectorizer should be improved to generate optimal 
(IVOPT-friendly) code, the difficult part is we want a generic infrastructure.  
After investigation, I tried to introduce a generic/simple local CSE interface 
by reusing existing algorithm/data-structure from tree-ssa-dom 
(tree-ssa-scopedtables).  The interface runs local CSE for each basic block in 
a bitmap, customers of this interface only need to record basic blocks in the 
bitmap when necessary.  Note we don't need scopedtables' unwinding facility 
since the interface runs only for single basic block, this should be good in 
terms of compilation time.
Besides CSE issue, this patch also re-associates address expressions in 
vect_create_addr_base_for_vector_ref, specifically, it splits constant offset 
and adds it back near the expression root in IR.  This is necessary because GCC 
only handles re-association for commutative operators in CSE.

I checked its impact on various test cases.
With this patch, PR68030's generated assembly is reduced from ~750 lines to 
~580 lines on x86_64, with both pre-header and loop body simplified.  But,
1) It doesn't fix all the problem on x86_64.  Root cause is computation for 
base address of the first reference is somehow moved outside of loop's 
pre-header, local CSE can't help in this case.  Though split_constant_offset 
can back track ssa def chain, it causes possible redundant when there is no CSE 
opportunities in pre-header.
2) It causes regression for PR68030 on AArch64.  I think the regression is 
caused by IVOPT issues which are exposed by this patch.  Checks on offset 
validity in get_address_cost is wrong/inaccurate now.  It considers an offset 
as valid if it's within the maximum offset range that backend supports.  This 
is not true, for example, AArch64 requires aligned offset additionally.  For 
example, LDR [base + 2060] is invalid for V4SFmode, although "2060" is within 
the maximum offset range.  Another issue is also in get_address_cost.  
Inaccurate cost is computed for "base + offset + INDEX" address expression.  
When register pressure is low, "base+offset" can be hoisted out and we can use 
[base + INDEX] addressing mode, whichhis is current behavior.

Bootstrap and test on x86_64 and AArch64.  Any comments appreciated.

Thanks,
bin

2016-05-17 Bin Cheng  <bin.ch...@arm.com>

        PR tree-optimization/68030
        PR tree-optimization/69710
        * tree-ssa-dom.c (cse_bbs): New function.
        * tree-ssa-dom.h (cse_bbs): New declaration.
        * tree-vect-data-refs.c (vect_create_addr_base_for_vector_ref):
        Re-associate address by splitting constant offset.
        (vect_create_data_ref_ptr, vect_setup_realignment): Record changed
        basic block.
        * tree-vect-loop-manip.c (vect_gen_niters_for_prolog_loop): Record
        changed basic block.
        * tree-vectorizer.c (tree-ssa-dom.h): Include header file.
        (changed_bbs): New variable.
        (vectorize_loops): Allocate and free CHANGED_BBS.  Call cse_bbs.
        * tree-vectorizer.h (changed_bbs): New declaration.
diff --git a/gcc/tree-ssa-dom.c b/gcc/tree-ssa-dom.c
index 8bf5b3c..b2a0b0c 100644
--- a/gcc/tree-ssa-dom.c
+++ b/gcc/tree-ssa-dom.c
@@ -2090,3 +2090,50 @@ lookup_avail_expr (gimple *stmt, bool insert,
 
   return lhs;
 }
+
+/* A local CSE interface which runs CSE for basic blocks recorded in
+   CHANGED_BBS.  */
+
+void
+cse_bbs (bitmap changed_bbs)
+{
+  unsigned index;
+  bitmap_iterator bi;
+  gimple_stmt_iterator gsi;
+
+  hash_table<expr_elt_hasher> *avail_exprs;
+  class avail_exprs_stack *avail_exprs_stack;
+  class const_and_copies *const_and_copies;
+
+  avail_exprs = new hash_table<expr_elt_hasher> (1024);
+  avail_exprs_stack = new class avail_exprs_stack (avail_exprs);
+  const_and_copies = new class const_and_copies ();
+
+  threadedge_initialize_values ();
+  /* Push a marker on the stacks of local information so that we know how
+     far to unwind when we finalize this block.  */
+  avail_exprs_stack->push_marker ();
+  const_and_copies->push_marker ();
+
+  EXECUTE_IF_SET_IN_BITMAP (changed_bbs, 0, index, bi)
+    {
+      basic_block bb = BASIC_BLOCK_FOR_FN (cfun, index);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+       fprintf (dump_file, "\n\nRun local cse on block #%d\n\n", bb->index);
+
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+       optimize_stmt (bb, gsi, const_and_copies, avail_exprs_stack);
+
+      /* Pop stacks to keep it small.  */
+      avail_exprs_stack->pop_to_marker ();
+      const_and_copies->pop_to_marker ();
+    }
+
+  delete avail_exprs;
+  avail_exprs = NULL;
+
+  delete avail_exprs_stack;
+  delete const_and_copies;
+  threadedge_finalize_values ();
+}
diff --git a/gcc/tree-ssa-dom.h b/gcc/tree-ssa-dom.h
index e6abe65..0ab1ec9 100644
--- a/gcc/tree-ssa-dom.h
+++ b/gcc/tree-ssa-dom.h
@@ -24,5 +24,6 @@ extern bool simple_iv_increment_p (gimple *);
 extern void record_temporary_equivalences (edge,
                                           class const_and_copies *,
                                           class avail_exprs_stack *);
+extern void cse_bbs (bitmap);
 
 #endif /* GCC_TREE_SSA_DOM_H */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 7652e21..ed5ef90 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -4108,23 +4108,27 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
       base_name = get_name (DR_REF (dr));
     }
 
-  /* Create base_offset */
-  base_offset = size_binop (PLUS_EXPR,
-                           fold_convert (sizetype, base_offset),
-                           fold_convert (sizetype, init));
+  base_offset = fold_convert (sizetype, base_offset);
+  init = fold_convert (sizetype, init);
 
   if (offset)
     {
       offset = fold_build2 (MULT_EXPR, sizetype,
                            fold_convert (sizetype, offset), step);
-      base_offset = fold_build2 (PLUS_EXPR, sizetype,
-                                base_offset, offset);
+      if (TREE_CODE (offset) == INTEGER_CST)
+       init = fold_build2 (PLUS_EXPR, sizetype, init, offset);
+      else
+       base_offset = fold_build2 (PLUS_EXPR, sizetype,
+                                  base_offset, offset);
     }
   if (byte_offset)
     {
       byte_offset = fold_convert (sizetype, byte_offset);
-      base_offset = fold_build2 (PLUS_EXPR, sizetype,
-                                base_offset, byte_offset);
+      if (TREE_CODE (byte_offset) == INTEGER_CST)
+       init = fold_build2 (PLUS_EXPR, sizetype, init, byte_offset);
+      else
+       base_offset = fold_build2 (PLUS_EXPR, sizetype,
+                                  base_offset, byte_offset);
     }
 
   /* base + base_offset */
@@ -4138,6 +4142,10 @@ vect_create_addr_base_for_vector_ref (gimple *stmt,
     }
 
   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
+  addr_base = force_gimple_operand (addr_base, &seq, true, NULL);
+  gimple_seq_add_seq (new_stmt_list, seq);
+  /* Add constant offset at last.  */
+  addr_base = fold_build_pointer_plus (addr_base, init);
   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
   gimple_seq_add_seq (new_stmt_list, seq);
@@ -4371,6 +4379,7 @@ vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, 
struct loop *at_loop,
         {
           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
           gcc_assert (!new_bb);
+          bitmap_set_bit (changed_bbs, pe->src->index);
         }
       else
         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
@@ -5081,9 +5090,10 @@ vect_setup_realignment (gimple *stmt, 
gimple_stmt_iterator *gsi,
                                                        NULL_TREE, loop);
           if (loop)
             {
-             pe = loop_preheader_edge (loop);
+             pe = loop_preheader_edge (loop);
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
              gcc_assert (!new_bb);
+             bitmap_set_bit (changed_bbs, pe->src->index);
             }
           else
              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c
index 7ec6dae..2b93048 100644
--- a/gcc/tree-vect-loop-manip.c
+++ b/gcc/tree-vect-loop-manip.c
@@ -1895,10 +1895,11 @@ vect_gen_niters_for_prolog_loop (loop_vec_info 
loop_vinfo, tree loop_niters, int
 
       new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
       gcc_assert (!new_bb);
+      bitmap_set_bit (changed_bbs, pe->src->index);
 
       /* Create:  byte_misalign = addr & (vectype_align - 1)  */
       byte_misalign =
-        fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), 
+        fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
                      vectype_align_minus_1);
 
       /* Create:  elem_misalign = byte_misalign / element_size  */
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 2b25b45..9441121 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -73,6 +73,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cfgloop.h"
 #include "tree-vectorizer.h"
 #include "tree-ssa-propagate.h"
+#include "tree-ssa-dom.h"
 #include "dbgcnt.h"
 #include "tree-scalar-evolution.h"
 
@@ -82,6 +83,9 @@ source_location vect_location;
 
 /* Vector mapping GIMPLE stmt to stmt_vec_info. */
 vec<stmt_vec_info> stmt_vec_info_vec;
+
+/* Basic blocks should be cleaned up by CSE after vectorization.  */
+bitmap changed_bbs;
 
 /* For mapping simduid to vectorization factor.  */
 
@@ -508,6 +512,7 @@ vectorize_loops (void)
     note_simd_array_uses (&simd_array_to_simduid_htab);
 
   init_stmt_vec_info_vec ();
+  changed_bbs = BITMAP_ALLOC (NULL);
 
   /*  ----------- Analyze loops. -----------  */
 
@@ -619,6 +624,9 @@ vectorize_loops (void)
       loop->aux = NULL;
     }
 
+  if (!bitmap_empty_p (changed_bbs))
+    cse_bbs (changed_bbs);
+  BITMAP_FREE (changed_bbs);
   free_stmt_vec_info_vec ();
 
   /* Fold IFN_GOMP_SIMD_{VF,LANE,LAST_LANE,ORDERED_{START,END}} builtins.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index bd1d55a..3cd2d96 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -695,6 +695,7 @@ struct dataref_aux {
 #define MAX_VECTORIZATION_FACTOR 64
 
 extern vec<stmt_vec_info> stmt_vec_info_vec;
+extern bitmap changed_bbs;
 
 void init_stmt_vec_info_vec (void);
 void free_stmt_vec_info_vec (void);

Reply via email to