On Fri, 22 Jun 2018, Richard Biener wrote: > > The following makes the vectorizer consider all vector sizes as advertised > by targetm.vectorize.autovectorize_vector_sizes and decide on which > vector size to use based on costs. > > Given comparing costs is difficult if you do not know the number of > scalar iterations the patch simply uses the cost of a single vector > iteration (weighted by vectorization factor) to decide which variant > is better. For this we compute this cost also for -fno-vect-cost-model > (so even with that you'll get the vectorizer choose between sizes) > and store it in a new LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST. > > Otherwise this is straight-forward and doesn't really depend on > dataref/ddr analysis sharing (but that makes it less costly). > > Bootstrap / regtest pending on x86_64-unknown-linux-gnu.
I have committed 1/3 and 2/3 now, for the following I have done benchmarks on Haswell with SPEC CPU 2006 and results are in the noise (as expected). So with -Ofast -march=native I see the following number of vectorized loops: AVX256 AVX128 unpatched 13124 1367 patched 12893 1598 which means a slight shift towards AVX128 for whatever (unanalyzed) reason. As amendmend for the patch I am considering to keep -fno-vect-cost-model behavior the same as now - choose the first successful vectorization. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 0c2ab57bb20..8e3f5f550b0 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2336,6 +2336,10 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, loop_vinfos.safe_push (std::make_pair (loop_vinfo, current_vector_size)); + /* For -fno-vect-cost-model do as in the past, choose the + first successful vectorization. */ + if (unlimited_cost_model (loop)) + break; } else delete loop_vinfo; the patch already bootstrapped and tested fine on x86_64-unknown-linux-gnu without the above hunk (the vectorizer testsuite mostly uses -fno-vect-cost-model). Richard. > Richard. > > From 4234213fcd5e6fe844bc1171938e12ab189ef290 Mon Sep 17 00:00:00 2001 > From: Richard Guenther <rguent...@suse.de> > Date: Thu, 21 Jun 2018 13:40:14 +0200 > Subject: [PATCH] try-multiple-vector-sizes-and-compare-costs > > 2018-06-22 Richard Biener <rguent...@suse.de> > > * tree-vectorizer.h (struct _loop_vec_info): Add > single_vector_iteration_cost member. > (LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST): New. > * tree-vect-loop.c (_loop_vec_info::~_loop_vec_info): Do not > reset loop->aux here. > (vect_analyze_loop_form): Do not assert or set loop->aux here. > (vect_analyze_loop): Iterate over all vector sizes and decide > based on the vector iteration cost which one to use. > (vect_estimate_min_profitable_iters): Move check for > unlimited cost model later to not skip cost computation or > dumping. Set LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST. > (vect_create_epilog_for_reduction): Use loop_vinfo rather > than loop_vec_info_for_loop. > > diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c > index dacc8811636..0c2ab57bb20 100644 > --- a/gcc/tree-vect-loop.c > +++ b/gcc/tree-vect-loop.c > @@ -947,8 +947,6 @@ _loop_vec_info::~_loop_vec_info () > > release_vec_loop_masks (&masks); > delete ivexpr_map; > - > - loop->aux = NULL; > } > > /* Return an invariant or register for EXPR and emit necessary > @@ -1395,8 +1393,6 @@ vect_analyze_loop_form (struct loop *loop, > vec_info_shared *shared) > STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond)) > = loop_exit_ctrl_vec_info_type; > > - gcc_assert (!loop->aux); > - loop->aux = loop_vinfo; > return loop_vinfo; > } > > @@ -2285,7 +2281,6 @@ loop_vec_info > vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo, > vec_info_shared *shared) > { > - loop_vec_info loop_vinfo; > auto_vector_sizes vector_sizes; > > /* Autodetect first vector size we try. */ > @@ -2316,11 +2311,12 @@ vect_analyze_loop (struct loop *loop, loop_vec_info > orig_loop_vinfo, > } > > unsigned n_stmts; > + auto_vec<std::pair<loop_vec_info, poly_uint64> > loop_vinfos; > poly_uint64 autodetected_vector_size = 0; > while (1) > { > /* Check the CFG characteristics of the loop (nesting, entry/exit). */ > - loop_vinfo = vect_analyze_loop_form (loop, shared); > + loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared); > if (!loop_vinfo) > { > if (dump_enabled_p ()) > @@ -2338,10 +2334,24 @@ vect_analyze_loop (struct loop *loop, loop_vec_info > orig_loop_vinfo, > { > LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; > > - return loop_vinfo; > + loop_vinfos.safe_push (std::make_pair (loop_vinfo, > + current_vector_size)); > } > + else > + delete loop_vinfo; > > - delete loop_vinfo; > + /* If all vector sizes are going to fail or if we didn't end up > + with any vector fail. */ > + if (fatal > + || known_eq (current_vector_size, 0U)) > + { > + /* With gathers/scatters we can end up claiming dataref analysis > + failed for one but not the other vector size so we can't > + really trust "fatal" here and assert loop_vinfos has length > + zero. Still stop looking for other vector sizes since that > + is what we've done before. */ > + break; > + } > > if (next_size == 0) > autodetected_vector_size = current_vector_size; > @@ -2350,10 +2360,8 @@ vect_analyze_loop (struct loop *loop, loop_vec_info > orig_loop_vinfo, > && known_eq (vector_sizes[next_size], autodetected_vector_size)) > next_size += 1; > > - if (fatal > - || next_size == vector_sizes.length () > - || known_eq (current_vector_size, 0U)) > - return NULL; > + if (next_size == vector_sizes.length ()) > + break; > > /* Try the next biggest vector size. */ > current_vector_size = vector_sizes[next_size++]; > @@ -2366,6 +2374,30 @@ vect_analyze_loop (struct loop *loop, loop_vec_info > orig_loop_vinfo, > dump_printf (MSG_NOTE, "\n"); > } > } > + > + if (loop_vinfos.is_empty ()) > + return NULL; > + > + /* Go over vinfos, selecting the one with best cost. */ > + loop_vec_info loop_vinfo = loop_vinfos[0].first; > + current_vector_size = loop_vinfos[0].second; > + for (unsigned i = 1; i < loop_vinfos.length (); ++i) > + if (known_lt (LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST > (loop_vinfos[i].first) > + * LOOP_VINFO_VECT_FACTOR (loop_vinfo), > + LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST (loop_vinfo) > + * LOOP_VINFO_VECT_FACTOR (loop_vinfos[i].first))) > + { > + /* ??? Just comparing the vector iteration cost possibly isn't > + the best thing to do. */ > + delete loop_vinfo; > + loop_vinfo = loop_vinfos[i].first; > + current_vector_size = loop_vinfos[i].second; > + } > + else > + delete loop_vinfos[i].first; > + > + set_stmt_vec_info_vec (&loop_vinfo->stmt_vec_infos); > + return loop_vinfo; > } > > /* Return true if there is an in-order reduction function for CODE, storing > @@ -3432,15 +3464,6 @@ vect_estimate_min_profitable_iters (loop_vec_info > loop_vinfo, > int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo); > void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo); > > - /* Cost model disabled. */ > - if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) > - { > - dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); > - *ret_min_profitable_niters = 0; > - *ret_min_profitable_estimate = 0; > - return; > - } > - > /* Requires loop versioning tests to handle misalignment. */ > if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) > { > @@ -3693,6 +3716,7 @@ vect_estimate_min_profitable_iters (loop_vec_info > loop_vinfo, > finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost, > &vec_inside_cost, &vec_epilogue_cost); > > + LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST (loop_vinfo) = vec_inside_cost; > vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost); > > if (dump_enabled_p ()) > @@ -3716,6 +3740,17 @@ vect_estimate_min_profitable_iters (loop_vec_info > loop_vinfo, > peel_iters_epilogue); > } > > + /* Cost model disabled. We still do the above work to be able to > + compute a vector iteration cost and provide cost analysis in the > + dumps. */ > + if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo))) > + { > + dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n"); > + *ret_min_profitable_niters = 0; > + *ret_min_profitable_estimate = 0; > + return; > + } > + > /* Calculate number of iterations required to make the vector version > profitable, relative to the loop bodies only. The following condition > must hold true: > @@ -5751,8 +5786,7 @@ vect_finalize_reduction: > > /* Create vector phi node. */ > vect_phi = create_phi_node (vec_initial_def, bb); > - new_phi_vinfo = new_stmt_vec_info (vect_phi, > - loop_vec_info_for_loop (outer_loop)); > + new_phi_vinfo = new_stmt_vec_info (vect_phi, loop_vinfo); > set_vinfo_for_stmt (vect_phi, new_phi_vinfo); > > /* Create vs0 - initial def of the double reduction phi. > */ > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index 55f8e6e4407..e93f40ba73f 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -468,6 +468,9 @@ typedef struct _loop_vec_info : public vec_info { > /* Cost of a single scalar iteration. */ > int single_scalar_iteration_cost; > > + /* Cost of a single vector iteration. */ > + int single_vector_iteration_cost; > + > /* Is the loop vectorizable? */ > bool vectorizable; > > @@ -571,6 +574,7 @@ typedef struct _loop_vec_info : public vec_info { > #define LOOP_VINFO_HAS_MASK_STORE(L) (L)->has_mask_store > #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec > #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) > (L)->single_scalar_iteration_cost > +#define LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST(L) > (L)->single_vector_iteration_cost > #define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info > > #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \ > -- Richard Biener <rguent...@suse.de> SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)