On Thu, May 19, 2016 at 9:46 PM, Ilya Enkovich <enkovich....@gmail.com> wrote:
> Hi,
>
> This patch enables vectorization of loop epilogues and low trip count
> loops using masking.
I wonder why we have the epilogue masking restriction with respect to
the original vectorization factor - shouldn't this simply be handled by
vectorizing the epilogue, first trying the original VF (which requires
masking and is equivalent to low trip count loop vectorization) and
then, if that is not profitable, iterating to smaller VFs?  [Yes,
ideally we'd be able to compare the cost of vectorization with
different VFs and choose the best VF.]

Thanks,
Richard.

> Thanks,
> Ilya
> --
> gcc/
>
> 2016-05-19  Ilya Enkovich  <ilya.enkov...@intel.com>
>
>         * dbgcnt.def (vect_tail_mask): New.
>         * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop
>         epilogues and low trip count loops.
>         (vect_get_known_peeling_cost): Ignore scalar epilogue cost for
>         loops we are going to mask.
>         (vect_estimate_min_profitable_iters): Support masked loop
>         epilogues and low trip count loops.
>         * tree-vectorizer.c (vectorize_loops): Add a message for a case
>         when loop epilogue can't be vectorized.
>
>
> diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def
> index 73c2966..5aad1d7 100644
> --- a/gcc/dbgcnt.def
> +++ b/gcc/dbgcnt.def
> @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra)
>  DEBUG_COUNTER (vect_loop)
>  DEBUG_COUNTER (vect_slp)
>  DEBUG_COUNTER (vect_tail_combine)
> +DEBUG_COUNTER (vect_tail_mask)
>  DEBUG_COUNTER (dom_unreachable_edges)
> diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
> index 1a80c42..7075f29 100644
> --- a/gcc/tree-vect-loop.c
> +++ b/gcc/tree-vect-loop.c
> @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
>    int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    HOST_WIDE_INT estimated_niter;
>    unsigned th;
> -  int min_scalar_loop_bound;
> +  int min_scalar_loop_bound = 0;
>
>    /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
>    ok = vect_analyze_slp (loop_vinfo, n_stmts);
> @@ -2224,6 +2224,30 @@ start_over:
>    unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
>    gcc_assert (vectorization_factor != 0);
>
> +  /* For now we mask loop epilogue using the same VF since it was used
> +     for cost estimations and it should be easier for reduction
> +     optimization.  */
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                         "not vectorized: VF for loop epilogue doesn't "
> +                         "match original loop VF.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +      && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor)
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                         "not vectorized: VF for loop epilogue is too small\n");
> +      return false;
> +    }
> +
>    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
>      dump_printf_loc (MSG_NOTE, vect_location,
>                       "vectorization_factor = %d, niters = "
> @@ -2237,11 +2261,29 @@ start_over:
>        || (max_niter != -1
>            && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
>      {
> -      if (dump_enabled_p ())
> -        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> -                         "not vectorized: iteration count smaller than "
> -                         "vectorization factor.\n");
> -      return false;
> +      /* Allow low trip count for loop epilogue we want to mask.  */
> +      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +          && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +        ;
> +      /* Allow low trip count for non-epilogue loops if flag is enabled.  */
> +      else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +               && flag_tree_vectorize_short_loops)
> +        {
> +          if (dump_enabled_p ())
> +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                             "iteration count is small, masking is "
> +                             "required for chosen vectorization factor.\n");
> +
> +          LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +        }
> +      else
> +        {
> +          if (dump_enabled_p ())
> +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                             "not vectorized: iteration count smaller than "
> +                             "vectorization factor.\n");
> +          return false;
> +        }
>      }
>
>    /* Analyze the alignment of the data-refs in the loop.
> @@ -2282,6 +2324,16 @@ start_over:
>        return false;
>      }
>
> +  LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true;
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                         "vectorizing loop epilogue with masking.\n");
> +      LOOP_VINFO_NEED_MASKING (loop_vinfo) = true;
> +    }
> +
>    if (slp)
>      {
>        /* Analyze operations in the SLP instances.  Note this may
> @@ -2305,6 +2357,19 @@ start_over:
>        return false;
>      }
>
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +    {
> +      gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +                  || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo));
> +
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                         "not vectorized: loop cannot be masked.\n");
> +
> +      return false;
> +    }
> +
>    /* Analyze cost.  Decide if worth while to vectorize.  */
>    int min_profitable_estimate, min_profitable_iters;
>    int min_profitable_combine_iters;
> @@ -2324,8 +2389,9 @@ start_over:
>        goto again;
>      }
>
> -  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> -                            * vectorization_factor) - 1);
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
> +                              * vectorization_factor) - 1);
>
>    /* Use the cost model only if it is more conservative than user specified
>       threshold.  */
> @@ -2425,18 +2491,28 @@ start_over:
>    else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)
>             && min_profitable_combine_iters >= 0)
>      {
> -      if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> -            && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> -                >= (unsigned) min_profitable_combine_iters))
> +      if ((LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +           || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
> +               && (LOOP_VINFO_INT_NITERS (loop_vinfo)
> +                   >= (unsigned) min_profitable_combine_iters))
>             || estimated_niter == -1
>             || estimated_niter >= min_profitable_combine_iters)
> -          && dbg_cnt (vect_tail_combine))
> +          && (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +              || dbg_cnt (vect_tail_combine)))
>          {
>            LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
>            LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true;
>
> -          dump_printf_loc (MSG_NOTE, vect_location,
> -                           "Decided to combine loop with its epilogue.\n");
> +          if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ())
> +            {
> +              if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +                dump_printf_loc (MSG_NOTE, vect_location,
> +                                 "Decided to vectorize low trip count loop "
> +                                 "with masking.\n");
> +              else
> +                dump_printf_loc (MSG_NOTE, vect_location,
> +                                 "Decided to combine loop with its epilogue.\n");
> +            }
>
>            /* We need to adjust profitability check if combine
>               epilogue considering additional vector iteration
> @@ -2463,6 +2539,22 @@ start_over:
>          }
>      }
>
> +  /* Check for not profitable low trip count loop vectorization.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo)
> +      && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +      && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo))
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_NOTE, vect_location,
> +                         "not vectorized: low trip count loop "
> +                         "vectorization is not profitable.\n");
> +      return false;
> +    }
> +
> +  if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo)
> +      && !dbg_cnt (vect_tail_mask))
> +    LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false;
> +
>    /* Ok to vectorize!  */
>    return true;
>
> @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
>                                  si->count * peel_iters_prologue,
>                                  si->kind, NULL, si->misalign,
>                                  vect_prologue);
> -  if (*peel_iters_epilogue)
> +  if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo))
>      FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
>        retval += record_stmt_cost (epilogue_cost_vec,
>                                    si->count * *peel_iters_epilogue,
> @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>    int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
>    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
>
> +  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
> +    {
> +      /* Currently we don't produce scalar epilogue version in case
> +         its masked version is provided.  It means we don't need to
> +         compute profitability one more time here.  Just make a
> +         masked loop version.  */
> +      if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo))
> +        {
> +          gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo));
> +
> +          dump_printf_loc (MSG_NOTE, vect_location,
> +                           "cost model: mask loop epilogue.\n");
> +
> +          *ret_min_profitable_niters = 0;
> +          *ret_min_profitable_estimate = 0;
> +          *ret_min_profitable_combine_niters = 0;
> +          return;
> +        }
> +      else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
> +        {
> +          dump_printf_loc (MSG_NOTE, vect_location,
> +                           "cost model disabled for epilogue.\n");
> +          *ret_min_profitable_niters = 0;
> +          *ret_min_profitable_estimate = 0;
> +          return;
> +        }
> +    }
>    /* Cost model disabled.  */
> -  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
> +  else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
>      {
>        dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
>        *ret_min_profitable_niters = 0;
>        *ret_min_profitable_estimate = 0;
> +      *ret_min_profitable_combine_niters = -1;
> +
> +      if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +        *ret_min_profitable_combine_niters = 0;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +               && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +        LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +      else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +               && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo))
> +        *ret_min_profitable_combine_niters = 0;
> +
>        return;
>      }
>
> @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                                    si->count * peel_iters_prologue,
>                                    si->kind, stmt_info, si->misalign,
>                                    vect_prologue);
> -            (void) add_stmt_cost (target_cost_data,
> -                                  si->count * peel_iters_epilogue,
> -                                  si->kind, stmt_info, si->misalign,
> -                                  vect_epilogue);
> +            /* We shouldn't add scalar epilogue cost for low trip
> +               count loops which are masked and have no epilogue.  */
> +            if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +              (void) add_stmt_cost (target_cost_data,
> +                                    si->count * peel_iters_epilogue,
> +                                    si->kind, stmt_info, si->misalign,
> +                                    vect_epilogue);
>          }
>      }
>    else
> @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                       " Calculated minimum iters for profitability: %d\n",
>                       min_profitable_iters);
>
> -  min_profitable_iters =
> -    min_profitable_iters < vf ? vf : min_profitable_iters;
> +  /* Adjust to VF for non-masked loops.  */
> +  if (!LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    min_profitable_iters = MAX (min_profitable_iters, vf);
>
>    /* Because the condition we create is:
>       if (niters <= min_profitable_iters)
> @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>
>    *ret_min_profitable_combine_niters = -1;
>
> +  /* Handle low trip count loops.  */
> +  if (LOOP_VINFO_NEED_MASKING (loop_vinfo))
> +    {
> +      /* Masked iteration should be better than a scalar loop:
> +         MIC + VIC + MOC < SIC * epilogue_niters  */
> +      if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost)
> +          >= (scalar_single_iter_cost * peel_iters_epilogue))
> +        {
> +          if (dump_enabled_p ())
> +            dump_printf_loc (MSG_NOTE, vect_location,
> +                             "Low trip count loop vectorization is not "
> +                             "profitable.\n");
> +          return;
> +        }
> +
> +      *ret_min_profitable_combine_niters = 0;
> +      return;
> +    }
> +
>    /* Don't try to vectorize epilogue of epilogue.  */
>    if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
>      return;
> @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>      {
>        if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED)
>          {
> -          if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +          if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)
> +            LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +          else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE)
> +            *ret_min_profitable_combine_niters = 0;
>            return;
>          }
> @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
>                         profitable_iters);
>        *ret_min_profitable_combine_niters = profitable_iters;
>      }
> +
> +  if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK))
> +    return;
> +
> +  /* Now compute profitability for loop epilogue masking.
> +     The following condition must hold true:
> +     SIC * epilogue_niters + SOC > VIC + MIC + MPC  */
> +  int min_profitable_masking_niters
> +    = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
> +       - scalar_outside_cost) / scalar_single_iter_cost;
> +  if (min_profitable_masking_niters > peel_iters_epilogue)
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_NOTE, vect_location,
> +                         "Loop epilogue masking is not profitable.\n");
> +    }
> +  else
> +    {
> +      if (dump_enabled_p ())
> +        dump_printf_loc (MSG_NOTE, vect_location,
> +                         "Loop epilogue masking is profitable.\n");
> +      LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true;
> +    }
>      }
>  }
>
> diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
> index 5f15246..f70aed6 100644
> --- a/gcc/tree-vectorizer.c
> +++ b/gcc/tree-vectorizer.c
> @@ -539,7 +539,16 @@ vectorize_loops (void)
>        loop->aux = loop_vinfo;
>
>        if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
> -        continue;
> +        {
> +          if (loop_vinfo
> +              && LOOP_VINFO_EPILOGUE_P (loop_vinfo)
> +              && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)
> +              && dump_enabled_p ())
> +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                             "loop epilogue can't be vectorized.\n");
> +
> +          continue;
> +        }
>
>        if (!dbg_cnt (vect_loop))
>          {
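
For readers following the cost model in the quoted patch: the two profitability
conditions (masking a low trip count loop, and masking a loop epilogue) can be
tried out in isolation with the small standalone sketch below.  It is not part
of the patch; all cost values are invented placeholders for the numbers GCC
accumulates through the target cost hooks, and the variable names simply
mirror the ones used in vect_estimate_min_profitable_iters.

#include <stdio.h>

/* Hypothetical costs in the vectorizer's abstract cost units; in GCC
   these come from the target cost hooks.  The names mirror the ones
   used in vect_estimate_min_profitable_iters.  */
static const int scalar_single_iter_cost = 5;  /* SIC */
static const int scalar_outside_cost = 3;      /* SOC */
static const int vec_inside_cost = 20;         /* VIC */
static const int masking_inside_cost = 6;      /* MIC */
static const int masking_prologue_cost = 2;    /* MPC */
static const int peel_iters_epilogue = 7;      /* leftover scalar iterations */

int
main (void)
{
  /* Low trip count loop: one masked vector iteration must beat running
     all remaining iterations in the scalar loop,
     MIC + MPC + VIC < SIC * epilogue_niters.  */
  int masked_cost = masking_inside_cost + masking_prologue_cost
                    + vec_inside_cost;
  int scalar_cost = scalar_single_iter_cost * peel_iters_epilogue;
  printf ("mask low trip count loop: %s (%d vs. %d)\n",
          masked_cost < scalar_cost ? "profitable" : "not profitable",
          masked_cost, scalar_cost);

  /* Loop epilogue masking: SIC * epilogue_niters + SOC > VIC + MIC + MPC,
     rewritten as a minimum number of epilogue iterations.  */
  int min_profitable_masking_niters
    = (vec_inside_cost + masking_inside_cost + masking_prologue_cost
       - scalar_outside_cost) / scalar_single_iter_cost;
  printf ("mask loop epilogue: %s (need at least %d epilogue iters, have %d)\n",
          min_profitable_masking_niters <= peel_iters_epilogue
          ? "profitable" : "not profitable",
          min_profitable_masking_niters, peel_iters_epilogue);
  return 0;
}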