On Wed, Nov 27, 2024 at 4:26 AM liuhongt <hongtao....@intel.com> wrote: > > When loop requires any kind of versioning which could increase register > pressure too much, and it's in a deeply nest big loop, don't do > vectorization. > > I tested the patch with both Ofast and O2 for SPEC2017, besides > 548.exchange_r, > other benchmarks are same binary. > > Bootstrapped and regtested 0on x86_64-pc-linux-gnu{-m32,} > Any comments?
The vectorizer tries to version an outer loop when vectorizing a loop nest and the versioning condition is invariant. See vect_loop_versioning. This tries to handle such cases. Often the generated runtime alias checks are not invariant because we do not consider the outer evolutions. I think we should instead fix this there. Question below ... > gcc/ChangeLog: > > pr target/117088 > * config/i386/i386.cc > (ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function. > (ix86_vector_costs::finish_cost): Prevent loop vectorization > if it's in a deeply nested loop and require versioning. > * config/i386/i386.opt (--param=vect-max-loop-depth=): New > param. > --- > gcc/config/i386/i386.cc | 89 ++++++++++++++++++++++++++++++++++++++++ > gcc/config/i386/i386.opt | 4 ++ > 2 files changed, 93 insertions(+) > > diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index 526c9df7618..608f40413d2 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -25019,6 +25019,8 @@ private: > > /* Estimate register pressure of the vectorized code. */ > void ix86_vect_estimate_reg_pressure (); > + /* Check if vect_loop is in a deeply-nested loop. */ > + bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop); > /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for > estimation of register pressure. > ??? Currently it's only used by vec_construct/scalar_to_vec > @@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure () > } > } > > +/* Return true if vect_loop is in a deeply-nested loop. > + .i.e vect_loop_n in below loop structure. > +loop1 > +{ > + loop2 > + { > + loop3 > + { > + vect_loop_1; > + loop4 > + { > + vect_loop_2; > + loop5 > + { > + vect_loop_3; > + loop6 > + { > + vect_loop_4; > + loop7 > + { > + vect_loop_5; > + loop8 > + { > + loop9 > + } > + vect_loop_6; > + } > + vect_loop_7; > + } > + } > + } > + } > + It's a big hammer to fix O2 regression for 548.exchange_r after > vectorization > + is enhanced by (r15-4225-g70c3db511ba14f) */ > +bool > +ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop) > +{ > + if (loop_depth (vect_loop) > (unsigned) ix86_vect_max_loop_depth) > + return true; > + > + if (loop_depth (vect_loop) < 2) > + return false; > + while the above two are "obvious", what you check below isn't clear to me. Is this trying to compute whether 'vect_loop' is inside of a loop nest which at any sibling of vect_loop (or even sibling of an outer loop of vect_loop, recursively) is a sub-nest with a loop depth (relative to what?) exceeds ix86_vect_max_loop_depth? > + class loop* outer_loop = loop_outer (vect_loop); > + > + auto_vec<class loop*> m_loop_stack; > + auto_sbitmap m_visited_loops (number_of_loops (cfun)); > + > + /* Get all sibling loops for vect_loop. */ > + class loop* next_loop = outer_loop->inner; > + for (; next_loop; next_loop = next_loop->next) > + { > + m_loop_stack.safe_push (next_loop); > + bitmap_set_bit (m_visited_loops, next_loop->num); > + } > + > + /* DFS the max depth of all sibling loop. */ > + while (!m_loop_stack.is_empty ()) > + { > + next_loop = m_loop_stack.pop (); > + if (loop_depth (next_loop) > (unsigned) ix86_vect_max_loop_depth) > + return true; > + > + class loop* inner_loop = next_loop->inner; > + while (inner_loop) > + { > + if (!bitmap_bit_p (m_visited_loops, inner_loop->num)) > + { > + m_loop_stack.safe_push (inner_loop); > + bitmap_set_bit (m_visited_loops, inner_loop->num); > + } > + inner_loop = inner_loop->next; > + } > + } > + > + return false; > +} > + > void > ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) > { > @@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs > *scalar_costs) > && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) > m_costs[vect_body] = INT_MAX; > + > + /* Prohibit vectorization when the loop requires versioning > + and loop_depth exceeds threshold. */ > + if ((LOOP_REQUIRES_VERSIONING (loop_vinfo) > + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) > + || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > + || vect_apply_runtime_profitability_check_p (loop_vinfo)) this also applies to loops not requiring versioning - it practically applies to all loops that do not run for a constant VF-times times. > + && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo))) > + m_costs[vect_body] = INT_MAX; > } > > ix86_vect_estimate_reg_pressure (); > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt > index 99e86f545e8..c5abf83473d 100644 > --- a/gcc/config/i386/i386.opt > +++ b/gcc/config/i386/i386.opt > @@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation. > mamx-movrs > Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save > Support AMX-MOVRS built-in functions and code generation. > + > +-param=vect-max-loop-depth= > +Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param > +Preversion loop vectorization when it's in a deeply nested loop and requires > versioning, since it may increase register pressure too much. > -- > 2.34.1 >