On 16 Jun 18:52, Ilya Enkovich wrote:
> 2016-06-15 15:00 GMT+03:00 Richard Biener <richard.guent...@gmail.com>:
> > On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich....@gmail.com> 
> > wrote:
> >> Hi,
> >>
> >> This patch enables vectorization of loop epilogues and low trip count
> >> loops using masking.
> >
> > I wonder why we have the epilogue masking restriction with respect to
> > the original vectorization factor - shouldn't this simply be handled by
> > vectorizing the epilogue?  First trying the original VF (requires masking
> > and is equivalent to low-tripcount loop vectorization), then if that is not
> > profitable iterate to smaller VFs?   [yes, ideally we'd be able to compare
> > cost for vectorization with different VFs and choose the best VF]
> 
> When the main loop is vectorized using some VF, we compute epilogue masking
> profitability and generate an epilogue to be vectorized and masked using
> exactly the same VF.  In the ideal case we never fail to vectorize the
> epilogue because we check that it can be masked.  Unfortunately we may lose
> some info when generating a loop copy (e.g. scev info is lost) and therefore
> may fail to vectorize the epilogue.
> 
> I expect that if we lose some info and thus fail to vectorize for a
> specified VF (for which the main loop was successfully vectorized) then we
> are going to fail to vectorize for other vector sizes too.  Actually I'd
> prefer to try only one vector size for vectorization with masking, to save
> compilation time.
> 
> Thanks,
> Ilya
> 
> >
> > Thanks,
> > Richard.
> >
> >> Thanks,
> >> Ilya

Hi,

Here is an updated version.  It allows vectorization with a smaller vector size
in case we fail to vectorize with masking.

Thanks,
Ilya
--
gcc/

2016-05-24  Ilya Enkovich  <ilya.enkov...@intel.com>

        * dbgcnt.def (vect_tail_mask): New.
        * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
        epilogues and low trip count loops.
        (vect_get_known_peeling_cost): Ignore scalar epilogue cost for
        loops we are going to mask.
        (vect_estimate_min_profitable_iters): Support masked loop
        epilogues and low trip count loops.
        * tree-vectorizer.c (vectorize_loops): Add a message for a case
        when loop epilogue can't be vectorized.


diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
index 73c2966..5aad1d7 100644
--- a/gcc/dbgcnt.def
+++ b/gcc/dbgcnt.def
@@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
 DEBUG_COUNTER (vect_loop)
 DEBUG_COUNTER (vect_slp)
 DEBUG_COUNTER (vect_tail_combine)
+DEBUG_COUNTER (vect_tail_mask)
 DEBUG_COUNTER (dom_unreachable_edges)
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 95dfda9..78a6754 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -2205,7 +2205,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   HOST_WIDE_INT estimated_niter;
   unsigned th;
-  int min_scalar_loop_bound;
+  int min_scalar_loop_bound = 0;
 
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
   ok = vect_analyze_slp (loop_vinfo, n_stmts);
@@ -2230,6 +2230,45 @@ start_over:
   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   gcc_assert (vectorization_factor != 0);
 
+  /* For now we mask loop epilogue using the same VF since it was used
+     for cost estimations and it should be easier for reduction
+     optimization.  */
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
+    {
+      /* If we couldn't vectorize epilogue with masking then we may still
+        try to vectorize it with a smaller vector size.  */
+      if (LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) > (int)vectorization_factor
+         && flag_tree_vectorize_epilogues & VECT_EPILOGUE_NOMASK)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "couldn't mask loop epilogue; trying to vectorize "
+                            "using a smaller vector.\n");
+         LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo) = false;
+         LOOP_VINFO_NEED_MASKING (loop_vinfo) = false;
+       }
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: VF for loop epilogue doesn't "
+                            "match original loop VF.\n");
+         return false;
+       }
+    }
+
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: VF for loop epilogue is too small\n");
+      return false;
+    }
+
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
@@ -2243,11 +2282,29 @@ start_over:
       || (max_niter != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
     {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: iteration count smaller than "
-                        "vectorization factor.\n");
-      return false;
+      /* Allow low trip count for loop epilogue we want to mask.  */
+      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+       ;
+      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
+      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+              && flag_tree_vectorize_short_loops)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "iteration count is small, masking is "
+                            "required for chosen vectorization factor.\n");
+
+         LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+       }
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: iteration count smaller than "
+                            "vectorization factor.\n");
+         return false;
+       }
     }
 
   /* Analyze the alignment of the data-refs in the loop.
@@ -2288,6 +2345,16 @@ start_over:
       return false;
     }
 
+  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "vectorizing loop epilogue with masking.\n");
+      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
+    }
+
   if (slp)
     {
       /* Analyze operations in the SLP instances.  Note this may
@@ -2311,6 +2378,19 @@ start_over:
       return false;
     }
 
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+    {
+      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+                 || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
+
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: loop cannot be masked.\n");
+
+      return false;
+    }
+
   /* Analyze cost.  Decide if worth while to vectorize.  */
   int min_profitable_estimate, min_profitable_iters;
   int min_profitable_combine_iters;
@@ -2330,8 +2410,9 @@ start_over:
       goto again;
     }
 
-  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-                           * vectorization_factor) - 1);
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
+                             * vectorization_factor) - 1);
 
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
@@ -2433,18 +2514,28 @@ start_over:
   else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
           && min_profitable_combine_iters >= 0)
     {
-      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
-               >= (unsigned) min_profitable_combine_iters))
+      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
+          || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+              && (LOOP_VINFO_INT_NITERS (loop_vinfo)
+                  >= (unsigned) min_profitable_combine_iters))
           || estimated_niter == -1
           || estimated_niter >= min_profitable_combine_iters)
-         && dbg_cnt (vect_tail_combine))
+         && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+             || dbg_cnt (vect_tail_combine)))
        {
          LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
          LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
 
-         dump_printf_loc (MSG_NOTE, vect_location,
-                          "Decided to combine loop with its epilogue.\n");
+          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
+           {
+             if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "Decided to vectorize low trip count loop "
+                                "with masking.\n");
+             else
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "Decided to combine loop with its epilogue.\n");
+           }
 
          /* We need to adjust profitability check if combine
             epilogue considering additional vector iteration
@@ -2471,6 +2562,22 @@ start_over:
        }
     }
 
+  /* Check for not profitable low trip count loop vectorization.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
+      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "not vectorized: low trip count loop "
+                        "vectorization is not profitable.\n");
+      return false;
+    }
+
+  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
+      && !dbg_cnt (vect_tail_mask))
+    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
+
   /* Ok to vectorize!  */
   return true;
 
@@ -3421,7 +3528,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
                                  si->count * peel_iters_prologue,
                                  si->kind, NULL, si->misalign,
                                  vect_prologue);
-  if (*peel_iters_epilogue)
+  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
       retval += record_stmt_cost (epilogue_cost_vec,
                                  si->count * *peel_iters_epilogue,
@@ -3459,12 +3566,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
+  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+    {
+      /* Currently we don't produce scalar epilogue version in case
+        its masked version is provided.  It means we don't need to
+        compute profitability one more time here.  Just make a
+        masked loop version.  */
+      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
+       {
+         gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
+
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "cost model: mask loop epilogue.\n");
+
+         *ret_min_profitable_niters = 0;
+         *ret_min_profitable_estimate = 0;
+         *ret_min_profitable_combine_niters = 0;
+         return;
+       }
+      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
+       {
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "cost model disabled for epilogue.\n");
+         *ret_min_profitable_niters = 0;
+         *ret_min_profitable_estimate = 0;
+         return;
+       }
+    }
   /* Cost model disabled.  */
-  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
+  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
       *ret_min_profitable_niters = 0;
       *ret_min_profitable_estimate = 0;
+      *ret_min_profitable_combine_niters = -1;
+
+      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+       *ret_min_profitable_combine_niters = 0;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+       LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+              && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
+       *ret_min_profitable_combine_niters = 0;
+
       return;
     }
 
@@ -3552,10 +3697,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                                si->count * peel_iters_prologue,
                                si->kind, stmt_info, si->misalign,
                                vect_prologue);
-         (void) add_stmt_cost (target_cost_data,
-                               si->count * peel_iters_epilogue,
-                               si->kind, stmt_info, si->misalign,
-                               vect_epilogue);
+         /* We shouldn't add scalar epilogue cost for low trip
+            count loops which are masked and have no epilogue.  */
+         if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+           (void) add_stmt_cost (target_cost_data,
+                                 si->count * peel_iters_epilogue,
+                                 si->kind, stmt_info, si->misalign,
+                                 vect_epilogue);
        }
     }
   else
@@ -3752,8 +3900,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
               "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
 
-  min_profitable_iters =
-       min_profitable_iters < vf ? vf : min_profitable_iters;
+  /* Adjust to VF for non-masked loops.  */
+  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    min_profitable_iters = MAX (min_profitable_iters, vf);
 
   /* Because the condition we create is:
      if (niters <= min_profitable_iters)
@@ -3795,6 +3944,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   *ret_min_profitable_combine_niters = -1;
 
+  /* Handle low trip count loops.  */
+  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
+    {
+      /* Masked iteration should be better than a scalar loop:
+        MIC + VIC + MOC < SIC * epilogue_niters  */
+      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
+         >= (scalar_single_iter_cost * peel_iters_epilogue))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Low trip count loop vectorization is not "
+                            "profitable.\n");
+         return;
+       }
+
+      *ret_min_profitable_combine_niters = 0;
+      return;
+    }
+
   /* Don't try to vectorize epilogue of epilogue.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
     return;
@@ -3803,7 +3971,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
        {
-         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
+         if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
+           LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+         else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
            *ret_min_profitable_combine_niters = 0;
          return;
        }
@@ -3862,6 +4032,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                             profitable_iters);
          *ret_min_profitable_combine_niters = profitable_iters;
        }
+
+      if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
+       return;
+
+      /* Now compute profitability for loop epilogue masking.
+        The following condition must hold true:
+        SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
+      int min_profitable_masking_niters
+       = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
+          - scalar_outside_cost) / scalar_single_iter_cost;
+      if (min_profitable_masking_niters > peel_iters_epilogue)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Loop epilogue masking is not pofitable.\n");
+       }
+      else
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Loop epilogue masking is pofitable.\n");
+         LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
+       }
     }
 }
 
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 1fc8b65..3361ec1 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -539,7 +539,16 @@ vectorize_loops (void)
        loop->aux = loop_vinfo;
 
        if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
-         continue;
+         {
+           if (loop_vinfo
+               && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+               && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
+               && dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "loop epilogue can't be vectorized.\n");
+
+           continue;
+         }
 
         if (!dbg_cnt (vect_loop))
          {

Reply via email to