When a loop requires any kind of versioning, which could increase register pressure too much, and it is inside a deeply nested big loop, don't vectorize it.
I tested the patch with both -Ofast and -O2 on SPEC2017; apart from 548.exchange_r, the other benchmarks produce the same binaries. Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Any comments? gcc/ChangeLog: PR target/117088 * config/i386/i386.cc (ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function. (ix86_vector_costs::finish_cost): Prevent loop vectorization if it's in a deeply nested loop and requires versioning. * config/i386/i386.opt (--param=vect-max-loop-depth=): New param. --- gcc/config/i386/i386.cc | 89 ++++++++++++++++++++++++++++++++++++++++ gcc/config/i386/i386.opt | 4 ++ 2 files changed, 93 insertions(+) diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index 526c9df7618..608f40413d2 100644 --- a/gcc/config/i386/i386.cc +++ b/gcc/config/i386/i386.cc @@ -25019,6 +25019,8 @@ private: /* Estimate register pressure of the vectorized code. */ void ix86_vect_estimate_reg_pressure (); + /* Check if vect_loop is in a deeply-nested loop. */ + bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop); /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for estimation of register pressure. ??? Currently it's only used by vec_construct/scalar_to_vec @@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure () } } +/* Return true if vect_loop is in a deeply-nested loop. + i.e. vect_loop_n in the loop structure below. 
+loop1 +{ + loop2 + { + loop3 + { + vect_loop_1; + loop4 + { + vect_loop_2; + loop5 + { + vect_loop_3; + loop6 + { + vect_loop_4; + loop7 + { + vect_loop_5; + loop8 + { + loop9 + } + vect_loop_6; + } + vect_loop_7; + } + } + } + } + It's a big hammer to fix O2 regression for 548.exchange_r after vectorization + is enhanced by (r15-4225-g70c3db511ba14f) */ +bool +ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop) +{ + if (loop_depth (vect_loop) > (unsigned) ix86_vect_max_loop_depth) + return true; + + if (loop_depth (vect_loop) < 2) + return false; + + class loop* outer_loop = loop_outer (vect_loop); + + auto_vec<class loop*> m_loop_stack; + auto_sbitmap m_visited_loops (number_of_loops (cfun)); + + /* Get all sibling loops for vect_loop. */ + class loop* next_loop = outer_loop->inner; + for (; next_loop; next_loop = next_loop->next) + { + m_loop_stack.safe_push (next_loop); + bitmap_set_bit (m_visited_loops, next_loop->num); + } + + /* DFS the max depth of all sibling loop. */ + while (!m_loop_stack.is_empty ()) + { + next_loop = m_loop_stack.pop (); + if (loop_depth (next_loop) > (unsigned) ix86_vect_max_loop_depth) + return true; + + class loop* inner_loop = next_loop->inner; + while (inner_loop) + { + if (!bitmap_bit_p (m_visited_loops, inner_loop->num)) + { + m_loop_stack.safe_push (inner_loop); + bitmap_set_bit (m_visited_loops, inner_loop->num); + } + inner_loop = inner_loop->next; + } + } + + return false; +} + void ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) { @@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs) && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ()) > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo)))) m_costs[vect_body] = INT_MAX; + + /* Prohibit vectorization when the loop requires versioning + and loop_depth exceeds threshold. 
*/ + if ((LOOP_REQUIRES_VERSIONING (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) + || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) + || vect_apply_runtime_profitability_check_p (loop_vinfo)) + && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo))) + m_costs[vect_body] = INT_MAX; } ix86_vect_estimate_reg_pressure (); diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index 99e86f545e8..c5abf83473d 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation. mamx-movrs Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save Support AMX-MOVRS built-in functions and code generation. + +-param=vect-max-loop-depth= +Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param +Preversion loop vectorization when it's in a deeply nested loop and requires versioning, since it may increase register pressure too much. -- 2.34.1