Hi,

This is the rebased and reworked version of the unroll patch.  I wasn't entirely sure whether I should compare the costs of the unrolled loop_vinfo with the original loop_vinfo it was unrolled of. I did now, but I wasn't too sure whether it was a good idea to... Any thoughts on this?

Regards,

Andre


gcc/ChangeLog:

        * tree-vect-loop.c (vect_estimate_min_profitable_iters): Add suggested_unroll_factor parameter.
        (vect_analyze_loop_costing): Likewise.
        (vect_determine_partial_vectors_and_peeling): Don't mask an unrolled loop.
        (vect_analyze_loop_2): Support unrolling of loops.
        (vect_can_unroll): New function.
        (vect_try_unrolling): New function.
        (vect_analyze_loop_1): Add suggested_unroll_factor parameter and use it.         (vect_analyze_loop): Call vect_try_unrolling when unrolling suggested.
        (vectorizable_reduction): Don't single_defuse_cycle when unrolling.
        * tree-vectorizer.h (_loop_vec_info::_loop_vec_info):  Add suggested_unroll_factor member.
        (vector_costs::vector_costs): Add m_suggested_unroll_factor member.
        (vector_costs::suggested_unroll_factor): New getter.
        (finish_cost): Add suggested_unroll_factor out parameter and set it.
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 
a28bb6321d76b8222bc8cfdade151ca9b4dca406..cfce7de0430c852d37f1a93e2d6a2f630694f613
 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -153,7 +153,8 @@ along with GCC; see the file COPYING3.  If not see
    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
-static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
+                                               unsigned *);
 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
                                               bool *, bool *);
 
@@ -828,6 +829,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
vec_info_shared *shared)
     skip_main_loop_edge (nullptr),
     skip_this_loop_edge (nullptr),
     reusable_accumulators (),
+    suggested_unroll_factor (1),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
     rgroup_compare_type (NULL_TREE),
@@ -1811,7 +1813,8 @@ vect_known_niters_smaller_than_vf (loop_vec_info 
loop_vinfo)
    definitely no, or -1 if it's worth retrying.  */
 
 static int
-vect_analyze_loop_costing (loop_vec_info loop_vinfo)
+vect_analyze_loop_costing (loop_vec_info loop_vinfo,
+                          unsigned *suggested_unroll_factor)
 {
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
@@ -1845,7 +1848,8 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
 
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
-                                     &min_profitable_estimate);
+                                     &min_profitable_estimate,
+                                     suggested_unroll_factor);
 
   if (min_profitable_iters < 0)
     {
@@ -2129,10 +2133,16 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.
 
+        If we are unrolling we also do not want to use partial vectors. This
+        is to avoid the overhead of generating multiple masks and also to
+        avoid having to execute entire iterations of FALSE masked instructions
+        when dealing with one or less full iterations.
+
         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
-      if (param_vect_partial_vector_usage == 1
+      if ((param_vect_partial_vector_usage == 1
+          || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
@@ -2199,12 +2209,12 @@ vect_determine_partial_vectors_and_peeling 
(loop_vec_info loop_vinfo,
    for it.  The different analyses will record information in the
    loop_vec_info struct.  */
 static opt_result
-vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
+vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
+                    unsigned *suggested_unroll_factor, poly_uint64 min_vf = 2)
 {
   opt_result ok = opt_result::success ();
   int res;
   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
-  poly_uint64 min_vf = 2;
   loop_vec_info orig_loop_vinfo = NULL;
 
   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
@@ -2359,6 +2369,26 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool 
&fatal)
      set of rgroups.  */
   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
 
+  /* Apply the suggested unrolling factor, this was determined by the backend
+     during finish_cost the first time we ran the analyzis for this
+     vector mode.  */
+  if (loop_vinfo->suggested_unroll_factor > 1)
+    {
+      poly_uint64 unrolled_vf
+       = LOOP_VINFO_VECT_FACTOR (loop_vinfo) * 
loop_vinfo->suggested_unroll_factor;
+      /* Make sure the unrolled vectorization factor is less than the max
+         vectorization factor.  */
+      unsigned HOST_WIDE_INT max_vf = LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo);
+      if (max_vf == MAX_VECTORIZATION_FACTOR || known_le (unrolled_vf, max_vf))
+       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = unrolled_vf;
+      else
+       return opt_result::failure_at (vect_location,
+                                      "unrolling failed: unrolled"
+                                      " vectorization factor larger than"
+                                      " maximum vectorization factor: %d\n",
+                                      LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
+    }
+
   /* This is the point where we can re-start analysis with SLP forced off.  */
 start_over:
 
@@ -2550,7 +2580,7 @@ start_over:
     return ok;
 
   /* Check the costings of the loop make vectorizing worthwhile.  */
-  res = vect_analyze_loop_costing (loop_vinfo);
+  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
   if (res < 0)
     {
       ok = opt_result::failure_at (vect_location,
@@ -2940,6 +2970,115 @@ vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
   return true;
 }
 
+/* Determine whether we can unroll this loop.  */
+
+static bool
+vect_can_unroll (loop_vec_info loop_vinfo)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  if (known_le (vectorization_factor, 1U))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "will not unroll loop with a VF of 1 or less\n");
+      return false;
+    }
+
+  FOR_EACH_VEC_ELT (loop_vinfo->stmt_vec_infos, i, stmt_info)
+    {
+      if (STMT_VINFO_IN_PATTERN_P (stmt_info)
+         || !STMT_VINFO_RELEVANT_P (stmt_info)
+         || stmt_info->vectype == NULL_TREE)
+       continue;
+      /* Do not unroll loops with negative steps as it is unlikely that
+        vectorization will succeed due to the way we deal with negative steps
+        in loads and stores in 'get_load_store_type'.  */
+      if (stmt_info->dr_aux.dr
+         && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+       {
+         dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
+         tree step = vect_dr_behavior (loop_vinfo, dr_info)->step;
+         if (TREE_CODE (step) == INTEGER_CST
+             && tree_int_cst_compare (step, size_zero_node) < 0)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "could not unroll due to negative step\n");
+             return false;
+           }
+       }
+
+      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
+       {
+         auto red_info = info_for_reduction (loop_vinfo, stmt_info);
+         if (STMT_VINFO_REDUC_TYPE (red_info) != TREE_CODE_REDUCTION)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "could not unroll loop with reduction due to "
+                                "non TREE_CODE_REDUCTION\n");
+             return false;
+           }
+       }
+    }
+
+  return true;
+}
+
+
+/* Try to unroll the current loop.  First determine the unrolling factor using
+   the analysis done for the current vector mode.  Then re-analyze the loop for
+   the given unrolling factor and the current vector mode.  */
+
+static opt_loop_vec_info
+vect_try_unrolling (loop_vec_info loop_vinfo,
+                   const vect_loop_form_info *loop_form_info,
+                   unsigned suggested_unroll_factor)
+{
+  DUMP_VECT_SCOPE ("vect_try_unrolling");
+
+  if (!vect_can_unroll (loop_vinfo))
+    return opt_loop_vec_info::failure_at (vect_location,
+                                         "*** Can not unroll this loop.\n");
+
+  loop_vec_info unrolled_vinfo
+    = vect_create_loop_vinfo (loop_vinfo->loop, loop_vinfo->shared,
+                             loop_form_info, NULL);
+  unrolled_vinfo->vector_mode = loop_vinfo->vector_mode;
+
+  /* Use the suggested_unrolling_factor that was returned at the target's
+     TARGET_VECTORIZE_FINISH_COST hook.  */
+  unrolled_vinfo->suggested_unroll_factor = suggested_unroll_factor;
+  poly_uint64 unrolled_vf
+    = LOOP_VINFO_VECT_FACTOR (loop_vinfo) * suggested_unroll_factor;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "***** unrolling factor %d chosen for vector mode %s,"
+                    "re-trying analyzis...\n",
+                    suggested_unroll_factor,
+                    GET_MODE_NAME (unrolled_vinfo->vector_mode));
+  bool unrolling_fatal = false;
+  if (vect_analyze_loop_2 (unrolled_vinfo, unrolling_fatal,
+                          &suggested_unroll_factor, unrolled_vf))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "unrolling succeeded with factor = %d\n",
+                        suggested_unroll_factor);
+      unrolled_vinfo->loop->aux = NULL;
+      return opt_loop_vec_info::success (unrolled_vinfo);
+    }
+
+  loop_vinfo->loop->aux = NULL;
+  return opt_loop_vec_info::failure_at (vect_location,
+                                       "unrolling failed with factor = %d\n",
+                                       suggested_unroll_factor);
+}
+
 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
    MODE_I to the next mode useful to analyze.
@@ -2951,7 +3090,7 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared 
*shared,
                     loop_vec_info main_loop_vinfo,
                     const vector_modes &vector_modes, unsigned &mode_i,
                     machine_mode &autodetected_vector_mode,
-                    bool &fatal)
+                    bool &fatal, unsigned int *suggested_unroll_factor = NULL)
 {
   loop_vec_info loop_vinfo
     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
@@ -2960,7 +3099,8 @@ vect_analyze_loop_1 (class loop *loop, vec_info_shared 
*shared,
   loop_vinfo->vector_mode = vector_mode;
 
   /* Run the main analysis.  */
-  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
+  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
+                                       suggested_unroll_factor);
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Analysis %s with vector mode %s\n",
@@ -3081,15 +3221,40 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
     {
       unsigned int loop_vinfo_i = mode_i;
       bool fatal;
+      unsigned int suggested_unroll_factor = 1;
       opt_loop_vec_info loop_vinfo
        = vect_analyze_loop_1 (loop, shared, &loop_form_info,
                               NULL, vector_modes, mode_i,
-                              autodetected_vector_mode, fatal);
+                              autodetected_vector_mode, fatal,
+                              &suggested_unroll_factor);
       if (fatal)
        break;
 
       if (loop_vinfo)
        {
+         if (suggested_unroll_factor > 1)
+           {
+             opt_loop_vec_info unrolled_loop_vinfo
+               = vect_try_unrolling (loop_vinfo, &loop_form_info,
+                                     suggested_unroll_factor);
+             if (unrolled_loop_vinfo)
+               {
+                 if (vect_joust_loop_vinfos (unrolled_loop_vinfo, loop_vinfo))
+                   {
+                     delete loop_vinfo;
+                     loop_vinfo = unrolled_loop_vinfo;
+                   }
+                 else
+                   {
+                     if (dump_enabled_p ())
+                       dump_printf_loc (MSG_NOTE, vect_location,
+                                        "***** Original loop chosen over"
+                                        " unrolled due to costs.");
+                     delete unrolled_loop_vinfo;
+                   }
+               }
+           }
+
          /* Once we hit the desired simdlen for the first time,
             discard any previous attempts.  */
          if (simdlen
@@ -3158,10 +3323,33 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
 
+  if (first_loop_vinfo->suggested_unroll_factor > 1)
+    {
+      if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Re-trying analysis with first vector mode"
+                            " %s for epilogue with partial vectors of"
+                            " unrolled first loop.\n",
+                            GET_MODE_NAME (vector_modes[0]));
+         mode_i = 0;
+       }
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Re-trying analysis with same vector mode"
+                            " %s for epilogue of unrolled first loop.\n",
+                            GET_MODE_NAME (first_loop_vinfo->vector_mode));
+         mode_i = first_loop_i;
+       }
+    }
+
   /* Handle the case that the original loop can use partial
      vectorization, but want to only adopt it for the epilogue.
      The retry should be in the same mode as original.  */
-  if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
+  else if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
     {
       gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
                  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
@@ -3182,6 +3370,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
   /* ???  If first_loop_vinfo was using VOIDmode then we probably
      want to instead search for the corresponding mode in vector_modes[].  */
 
+  poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
   while (1)
     {
       bool fatal;
@@ -3193,6 +3382,22 @@ vect_analyze_loop (class loop *loop, vec_info_shared 
*shared)
       if (fatal)
        break;
 
+      if (loop_vinfo
+         && known_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), first_vinfo_vf)
+         && !(LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+              && maybe_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                           first_vinfo_vf)))
+       {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "***** Will not use %s mode as an"
+                              " epilogue, since it leads to an higher"
+                              " vectorization factor than main loop\n",
+                              GET_MODE_NAME (loop_vinfo->vector_mode));
+           delete loop_vinfo;
+           loop_vinfo = opt_loop_vec_info::success (NULL);
+       }
+
       if (loop_vinfo)
        {
          if (pick_lowest_cost_p)
@@ -3905,7 +4110,8 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, 
int peel_iters_prologue,
 static void
 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                    int *ret_min_profitable_niters,
-                                   int *ret_min_profitable_estimate)
+                                   int *ret_min_profitable_estimate,
+                                   unsigned *suggested_unroll_factor)
 {
   int min_profitable_iters;
   int min_profitable_estimate;
@@ -4265,8 +4471,9 @@ vect_estimate_min_profitable_iters (loop_vec_info 
loop_vinfo,
     }
 
   /* Complete the target-specific cost calculations.  */
-  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
-              &vec_inside_cost, &vec_epilogue_cost);
+  finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo),
+              &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
+              suggested_unroll_factor);
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
 
@@ -7255,7 +7462,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
    participating.  */
   if (ncopies > 1
       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
-      && reduc_chain_length == 1)
+      && reduc_chain_length == 1
+      && loop_vinfo->suggested_unroll_factor == 1)
     single_defuse_cycle = true;
 
   if (single_defuse_cycle || lane_reduc_code_p)
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 
b552e9dccce5bce6a3bbcf5d531e7ccefa719b9a..238e0b4bf21871c518da73a79320f34e55c9201c
 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -624,6 +624,13 @@ public:
      about the reductions that generated them.  */
   hash_map<tree, vect_reusable_accumulator> reusable_accumulators;
 
+  /* The number of times that the target suggested we unroll the vector loop
+     in order to promote more ILP.  This value will be used to re-analyze the
+     loop for vectorization and if successful the value will be folded into
+     vectorization_factor (and therefore exactly divides
+     vectorization_factor).  */
+  unsigned int suggested_unroll_factor;
+
   /* Maximum runtime vectorization factor, or MAX_VECTORIZATION_FACTOR
      if there is no particular limit.  */
   unsigned HOST_WIDE_INT max_vectorization_factor;
@@ -1430,6 +1437,7 @@ public:
   unsigned int prologue_cost () const;
   unsigned int body_cost () const;
   unsigned int epilogue_cost () const;
+  unsigned int suggested_unroll_factor () const;
 
 protected:
   unsigned int record_stmt_cost (stmt_vec_info, vect_cost_model_location,
@@ -1447,6 +1455,9 @@ protected:
   /* The costs of the three regions, indexed by vect_cost_model_location.  */
   unsigned int m_costs[3];
 
+  /* The suggested unrolling factor determined at finish_cost.  */
+  unsigned int m_suggested_unroll_factor;
+
   /* True if finish_cost has been called.  */
   bool m_finished;
 };
@@ -1459,6 +1470,7 @@ vector_costs::vector_costs (vec_info *vinfo, bool 
costing_for_scalar)
   : m_vinfo (vinfo),
     m_costing_for_scalar (costing_for_scalar),
     m_costs (),
+    m_suggested_unroll_factor(1),
     m_finished (false)
 {
 }
@@ -1490,6 +1502,15 @@ vector_costs::epilogue_cost () const
   return m_costs[vect_epilogue];
 }
 
+/* Return the suggested unroll factor.  */
+
+inline unsigned int
+vector_costs::suggested_unroll_factor () const
+{
+  gcc_checking_assert (m_finished);
+  return m_suggested_unroll_factor;
+}
+
 #define VECT_MAX_COST 1000
 
 /* The maximum number of intermediate steps required in multi-step type
@@ -1665,12 +1686,15 @@ add_stmt_cost (vector_costs *costs, stmt_info_for_cost 
*i)
 
 static inline void
 finish_cost (vector_costs *costs, unsigned *prologue_cost,
-            unsigned *body_cost, unsigned *epilogue_cost)
+            unsigned *body_cost, unsigned *epilogue_cost,
+            unsigned *suggested_unroll_factor = NULL)
 {
   costs->finish_cost ();
   *prologue_cost = costs->prologue_cost ();
   *body_cost = costs->body_cost ();
   *epilogue_cost = costs->epilogue_cost ();
+  if (suggested_unroll_factor)
+    *suggested_unroll_factor = costs->suggested_unroll_factor ();
 }
 
 inline void

Reply via email to