Hi, This patch enables vectorization of loop epilogues and low trip count loops using masking.
Thanks, Ilya -- gcc/ 2016-05-19 Ilya Enkovich <ilya.enkov...@intel.com> * dbgcnt.def (vect_tail_mask): New. * tree-vect-loop.c (vect_analyze_loop_2): Support masked loop epilogues and low trip count loops. (vect_get_known_peeling_cost): Ignore scalar epilogue cost for loops we are going to mask. (vect_estimate_min_profitable_iters): Support masked loop epilogues and low trip count loops. * tree-vectorizer.c (vectorize_loops): Add a message for a case when loop epilogue can't be vectorized. diff --git a/gcc/dbgcnt.def b/gcc/dbgcnt.def index 73c2966..5aad1d7 100644 --- a/gcc/dbgcnt.def +++ b/gcc/dbgcnt.def @@ -193,4 +193,5 @@ DEBUG_COUNTER (tree_sra) DEBUG_COUNTER (vect_loop) DEBUG_COUNTER (vect_slp) DEBUG_COUNTER (vect_tail_combine) +DEBUG_COUNTER (vect_tail_mask) DEBUG_COUNTER (dom_unreachable_edges) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 1a80c42..7075f29 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2199,7 +2199,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal) int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); HOST_WIDE_INT estimated_niter; unsigned th; - int min_scalar_loop_bound; + int min_scalar_loop_bound = 0; /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ ok = vect_analyze_slp (loop_vinfo, n_stmts); @@ -2224,6 +2224,30 @@ start_over: unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); gcc_assert (vectorization_factor != 0); + /* For now we mask loop epilogue using the same VF since it was used + for cost estimations and it should be easier for reduction + optimization. 
*/ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo) + && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) != (int)vectorization_factor) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: VF for loop epilogue doesn't " + "match original loop VF.\n"); + return false; + } + + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo) + && LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo) <= (int)vectorization_factor) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: VF for loop epilogue is too small\n"); + return false; + } + if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "vectorization_factor = %d, niters = " @@ -2237,11 +2261,29 @@ start_over: || (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: iteration count smaller than " - "vectorization factor.\n"); - return false; + /* Allow low trip count for loop epilogue we want to mask. */ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)) + ; + /* Allow low trip count for non-epilogue loops if flag is enabled. */ + else if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && flag_tree_vectorize_short_loops) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "iteration count is small, masking is " + "required for chosen vectorization factor.\n"); + + LOOP_VINFO_NEED_MASKING (loop_vinfo) = true; + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: iteration count smaller than " + "vectorization factor.\n"); + return false; + } } /* Analyze the alignment of the data-refs in the loop. 
@@ -2282,6 +2324,16 @@ start_over: return false; } + LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) = true; + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "vectorizing loop epilogue with masking.\n"); + LOOP_VINFO_NEED_MASKING (loop_vinfo) = true; + } + if (slp) { /* Analyze operations in the SLP instances. Note this may @@ -2305,6 +2357,19 @@ start_over: return false; } + if (LOOP_VINFO_NEED_MASKING (loop_vinfo) + && !LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)) + { + gcc_assert (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) + || !LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)); + + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: loop cannot be masked.\n"); + + return false; + } + /* Analyze cost. Decide if worth while to vectorize. */ int min_profitable_estimate, min_profitable_iters; int min_profitable_combine_iters; @@ -2324,8 +2389,9 @@ start_over: goto again; } - min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) - * vectorization_factor) - 1); + if (!LOOP_VINFO_NEED_MASKING (loop_vinfo)) + min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) + * vectorization_factor) - 1); /* Use the cost model only if it is more conservative than user specified threshold. 
*/ @@ -2425,18 +2491,28 @@ start_over: else if (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo) && min_profitable_combine_iters >= 0) { - if (((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && (LOOP_VINFO_INT_NITERS (loop_vinfo) - >= (unsigned) min_profitable_combine_iters)) + if ((LOOP_VINFO_NEED_MASKING (loop_vinfo) + || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && (LOOP_VINFO_INT_NITERS (loop_vinfo) + >= (unsigned) min_profitable_combine_iters)) || estimated_niter == -1 || estimated_niter >= min_profitable_combine_iters) - && dbg_cnt (vect_tail_combine)) + && (LOOP_VINFO_NEED_MASKING (loop_vinfo) + || dbg_cnt (vect_tail_combine))) { LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false; LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo) = true; - dump_printf_loc (MSG_NOTE, vect_location, - "Decided to combine loop with its epilogue.\n"); + if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) && dump_enabled_p ()) + { + if (LOOP_VINFO_NEED_MASKING (loop_vinfo)) + dump_printf_loc (MSG_NOTE, vect_location, + "Decided to vectorize low trip count loop " + "with masking.\n"); + else + dump_printf_loc (MSG_NOTE, vect_location, + "Decided to combine loop with its epilogue.\n"); + } /* We need to adjust profitability check if combine epilogue considering additional vector iteration @@ -2463,6 +2539,22 @@ start_over: } } + /* Check for not profitable low trip count loop vectorization. */ + if (LOOP_VINFO_NEED_MASKING (loop_vinfo) + && !LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !LOOP_VINFO_COMBINE_EPILOGUE (loop_vinfo)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "not vectorized: low trip count loop " + "vectorization is not profitable.\n"); + return false; + } + + if (LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) + && !dbg_cnt (vect_tail_mask)) + LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = false; + /* Ok to vectorize! 
*/ return true; @@ -3413,7 +3505,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue, si->count * peel_iters_prologue, si->kind, NULL, si->misalign, vect_prologue); - if (*peel_iters_epilogue) + if (*peel_iters_epilogue && !LOOP_VINFO_NEED_MASKING (loop_vinfo)) FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si) retval += record_stmt_cost (epilogue_cost_vec, si->count * *peel_iters_epilogue, @@ -3451,12 +3543,50 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + { + /* Currently we don't produce scalar epilogue version in case + its masked version is provided. It means we don't need to + compute profitability one more time here. Just make a + masked loop version. */ + if (LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo)) + { + gcc_assert (LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)); + + dump_printf_loc (MSG_NOTE, vect_location, + "cost model: mask loop epilogue.\n"); + + *ret_min_profitable_niters = 0; + *ret_min_profitable_estimate = 0; + *ret_min_profitable_combine_niters = 0; + return; + } + else if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED) + { + dump_printf_loc (MSG_NOTE, vect_location, + "cost model disabled for epilogue.\n"); + *ret_min_profitable_niters = 0; + *ret_min_profitable_estimate = 0; + return; + } + } /* Cost model disabled. 
*/ - if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) + else if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) { dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); *ret_min_profitable_niters = 0; *ret_min_profitable_estimate = 0; + *ret_min_profitable_combine_niters = -1; + + if (LOOP_VINFO_NEED_MASKING (loop_vinfo)) + *ret_min_profitable_combine_niters = 0; + else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK) + && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)) + LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true; + else if ((flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE) + && LOOP_VINFO_CAN_BE_MASKED (loop_vinfo)) + *ret_min_profitable_combine_niters = 0; + return; } @@ -3544,10 +3674,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, si->count * peel_iters_prologue, si->kind, stmt_info, si->misalign, vect_prologue); - (void) add_stmt_cost (target_cost_data, - si->count * peel_iters_epilogue, - si->kind, stmt_info, si->misalign, - vect_epilogue); + /* We shouldn't add scalar epilogue cost for low trip + count loops which are masked and have no epilogue. */ + if (!LOOP_VINFO_NEED_MASKING (loop_vinfo)) + (void) add_stmt_cost (target_cost_data, + si->count * peel_iters_epilogue, + si->kind, stmt_info, si->misalign, + vect_epilogue); } } else @@ -3744,8 +3877,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, " Calculated minimum iters for profitability: %d\n", min_profitable_iters); - min_profitable_iters = - min_profitable_iters < vf ? vf : min_profitable_iters; + /* Adjust to VF for non-masked loops. */ + if (!LOOP_VINFO_NEED_MASKING (loop_vinfo)) + min_profitable_iters = MAX (min_profitable_iters, vf); /* Because the condition we create is: if (niters <= min_profitable_iters) @@ -3787,6 +3921,25 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, *ret_min_profitable_combine_niters = -1; + /* Handle low trip count loops. 
*/ + if (LOOP_VINFO_NEED_MASKING (loop_vinfo)) + { + /* Masked iteration should be better than a scalar loop: + MIC + VIC + MOC < SIC * epilogue_niters */ + if ((int)(masking_inside_cost + masking_prologue_cost + vec_inside_cost) + >= (scalar_single_iter_cost * peel_iters_epilogue)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Low trip count loop vectorization is not " + "profitable.\n"); + return; + } + + *ret_min_profitable_combine_niters = 0; + return; + } + /* Don't try to vectorize epilogue of epilogue. */ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) return; @@ -3795,7 +3948,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, { if (flag_vect_epilogue_cost_model == VECT_COST_MODEL_UNLIMITED) { - if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE) + if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK) + LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true; + else if (flag_tree_vectorize_epilogues & VECT_EPILOGUE_COMBINE) *ret_min_profitable_combine_niters = 0; return; } @@ -3854,6 +4009,29 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, profitable_iters); *ret_min_profitable_combine_niters = profitable_iters; } + + if (!(flag_tree_vectorize_epilogues & VECT_EPILOGUE_MASK)) + return; + + /* Now compute profitability for loop epilogue masking. 
+ The following condition must hold true: + SIC * epilogue_niters + SOC > VIC + MIC + MPC */ + int min_profitable_masking_niters + = (vec_inside_cost + masking_inside_cost + masking_prologue_cost + - scalar_outside_cost) / scalar_single_iter_cost; + if (min_profitable_masking_niters > peel_iters_epilogue) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Loop epilogue masking is not pofitable.\n"); + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Loop epilogue masking is pofitable.\n"); + LOOP_VINFO_MASK_EPILOGUE (loop_vinfo) = true; + } } } diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 5f15246..f70aed6 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -539,7 +539,16 @@ vectorize_loops (void) loop->aux = loop_vinfo; if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo)) - continue; + { + if (loop_vinfo + && LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && LOOP_VINFO_ORIG_MASK_EPILOGUE (loop_vinfo) + && dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "loop epilogue can't be vectorized.\n"); + + continue; + } if (!dbg_cnt (vect_loop)) {