On Wed, Nov 27, 2024 at 4:26 AM liuhongt <hongtao....@intel.com> wrote:
>
> When loop requires any kind of versioning which could increase register
> pressure too much, and it's in a deeply nest big loop, don't do
> vectorization.
>
> I tested the patch with both Ofast and O2 for SPEC2017, besides 
> 548.exchange_r,
> other benchmarks are same binary.
>
> Bootstrapped and regtested 0on x86_64-pc-linux-gnu{-m32,}
> Any comments?

The vectorizer tries to version an outer loop when vectorizing a loop nest
and the versioning condition is invariant.  See vect_loop_versioning.  This
tries to handle such cases.  Often the generated runtime alias checks are
not invariant because we do not consider the outer evolutions.  I think we
should instead fix this there.

Question below ...

> gcc/ChangeLog:
>
>         pr target/117088
>         * config/i386/i386.cc
>         (ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function.
>         (ix86_vector_costs::finish_cost): Prevent loop vectorization
>         if it's in a deeply nested loop and require versioning.
>         * config/i386/i386.opt (--param=vect-max-loop-depth=): New
>         param.
> ---
>  gcc/config/i386/i386.cc  | 89 ++++++++++++++++++++++++++++++++++++++++
>  gcc/config/i386/i386.opt |  4 ++
>  2 files changed, 93 insertions(+)
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 526c9df7618..608f40413d2 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -25019,6 +25019,8 @@ private:
>
>    /* Estimate register pressure of the vectorized code.  */
>    void ix86_vect_estimate_reg_pressure ();
> +  /* Check if vect_loop is in a deeply-nested loop.  */
> +  bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop);
>    /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
>       estimation of register pressure.
>       ??? Currently it's only used by vec_construct/scalar_to_vec
> @@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
>      }
>  }
>
> +/* Return true if vect_loop is in a deeply-nested loop.
> +   .i.e vect_loop_n in below loop structure.
> +loop1
> +{
> + loop2
> + {
> +  loop3
> +  {
> +   vect_loop_1;
> +   loop4
> +   {
> +    vect_loop_2;
> +    loop5
> +    {
> +     vect_loop_3;
> +     loop6
> +     {
> +      vect_loop_4;
> +      loop7
> +      {
> +       vect_loop_5;
> +       loop8
> +       {
> +       loop9
> +       }
> +      vect_loop_6;
> +      }
> +     vect_loop_7;
> +     }
> +    }
> +   }
> + }
> + It's a big hammer to fix O2 regression for 548.exchange_r after 
> vectorization
> + is enhanced by (r15-4225-g70c3db511ba14f)  */
> +bool
> +ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop)
> +{
> +  if (loop_depth (vect_loop) > (unsigned) ix86_vect_max_loop_depth)
> +    return true;
> +
> +  if (loop_depth (vect_loop) < 2)
> +    return false;
> +

while the above two are "obvious", what you check below isn't clear to me.
Is this trying to compute whether 'vect_loop' is inside of a loop nest which
at any sibling of vect_loop (or even sibling of an outer loop of vect_loop,
recursively) is a sub-nest with a loop depth (relative to what?) exceeds
ix86_vect_max_loop_depth?

> +  class loop* outer_loop = loop_outer (vect_loop);
> +
> +  auto_vec<class loop*> m_loop_stack;
> +  auto_sbitmap m_visited_loops (number_of_loops (cfun));
> +
> +  /* Get all sibling loops for vect_loop.  */
> +  class loop* next_loop = outer_loop->inner;
> +  for (; next_loop; next_loop = next_loop->next)
> +    {
> +      m_loop_stack.safe_push (next_loop);
> +      bitmap_set_bit (m_visited_loops, next_loop->num);
> +    }
> +
> +  /* DFS the max depth of all sibling loop.  */
> +  while (!m_loop_stack.is_empty ())
> +    {
> +      next_loop = m_loop_stack.pop ();
> +      if (loop_depth (next_loop) > (unsigned) ix86_vect_max_loop_depth)
> +       return true;
> +
> +      class loop* inner_loop = next_loop->inner;
> +      while (inner_loop)
> +       {
> +         if (!bitmap_bit_p (m_visited_loops, inner_loop->num))
> +           {
> +             m_loop_stack.safe_push (inner_loop);
> +             bitmap_set_bit (m_visited_loops, inner_loop->num);
> +           }
> +         inner_loop = inner_loop->next;
> +       }
> +    }
> +
> +  return false;
> +}
> +
>  void
>  ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
>  {
> @@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs 
> *scalar_costs)
>           && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
>               > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
>         m_costs[vect_body] = INT_MAX;
> +
> +      /* Prohibit vectorization when the loop requires versioning
> +        and loop_depth exceeds threshold.  */
> +      if ((LOOP_REQUIRES_VERSIONING (loop_vinfo)
> +          || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
> +          || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
> +          || vect_apply_runtime_profitability_check_p (loop_vinfo))

this also applies to loops not requiring versioning - it practically applies to
all loops that do not run for a constant VF-times times.

> +         && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo)))
> +       m_costs[vect_body] = INT_MAX;
>      }
>
>    ix86_vect_estimate_reg_pressure ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index 99e86f545e8..c5abf83473d 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation.
>  mamx-movrs
>  Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save
>  Support AMX-MOVRS built-in functions and code generation.
> +
> +-param=vect-max-loop-depth=
> +Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param
> +Preversion loop vectorization when it's in a deeply nested loop and requires 
> versioning, since it may increase register pressure too much.
> --
> 2.34.1
>

Reply via email to