[PATCH] [x86] [RFC] Prevent loop vectorization if it's in a deeply nested big loop.

liuhongt Tue, 26 Nov 2024 19:25:46 -0800

When a loop requires any kind of versioning, which could increase register
pressure too much, and it is inside a deeply nested big loop, don't
vectorize it.


I tested the patch with both Ofast and O2 on SPEC2017; apart from
548.exchange_r, the binaries of the other benchmarks are unchanged.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Any comments?

gcc/ChangeLog:

        PR target/117088
        * config/i386/i386.cc
        (ix86_vector_costs::ix86_vect_in_deep_nested_loop_p): New function.
        (ix86_vector_costs::finish_cost): Prevent loop vectorization
        if it's in a deeply nested loop and requires versioning.
        * config/i386/i386.opt (--param=vect-max-loop-depth=): New
        param.
---
 gcc/config/i386/i386.cc  | 89 ++++++++++++++++++++++++++++++++++++++++
 gcc/config/i386/i386.opt |  4 ++
 2 files changed, 93 insertions(+)

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 526c9df7618..608f40413d2 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -25019,6 +25019,8 @@ private:
 
   /* Estimate register pressure of the vectorized code.  */
   void ix86_vect_estimate_reg_pressure ();
+  /* Check if vect_loop is in a deeply-nested loop.  */
+  bool ix86_vect_in_deep_nested_loop_p (class loop *vect_loop);
   /* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
      estimation of register pressure.
      ??? Currently it's only used by vec_construct/scalar_to_vec
@@ -25324,6 +25326,84 @@ ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
     }
 }
 
+/* Return true if vect_loop is in a deeply-nested loop,
+   i.e. it is one of the vect_loop_n in the loop structure below.
+loop1
+{
+ loop2
+ {
+  loop3
+  {
+   vect_loop_1;
+   loop4
+   {
+    vect_loop_2;
+    loop5
+    {
+     vect_loop_3;
+     loop6
+     {
+      vect_loop_4;
+      loop7
+      {
+       vect_loop_5;
+       loop8
+       {
+       loop9
+       }
+      vect_loop_6;
+      }
+     vect_loop_7;
+     }
+    }
+   }
+ }
+ It's a big hammer to fix the O2 regression for 548.exchange_r after
+ vectorization was enhanced by r15-4225-g70c3db511ba14f.  */
+bool
+ix86_vector_costs::ix86_vect_in_deep_nested_loop_p (class loop *vect_loop)
+{
+  if (loop_depth (vect_loop) > (unsigned) ix86_vect_max_loop_depth)
+    return true;
+
+  if (loop_depth (vect_loop) < 2)
+    return false;
+
+  class loop* outer_loop = loop_outer (vect_loop);
+
+  auto_vec<class loop*> m_loop_stack;
+  auto_sbitmap m_visited_loops (number_of_loops (cfun));
+  bitmap_clear (m_visited_loops); /* Bits start uninitialized.  */
+  /* Seed the worklist with vect_loop and all of its sibling loops.  */
+  class loop* next_loop = outer_loop->inner;
+  for (; next_loop; next_loop = next_loop->next)
+    {
+      m_loop_stack.safe_push (next_loop);
+      bitmap_set_bit (m_visited_loops, next_loop->num);
+    }
+
+  /* DFS the sibling subtrees; stop at the first loop deeper than the limit.  */
+  while (!m_loop_stack.is_empty ())
+    {
+      next_loop = m_loop_stack.pop ();
+      if (loop_depth (next_loop) > (unsigned) ix86_vect_max_loop_depth)
+       return true;
+
+      class loop* inner_loop = next_loop->inner;
+      while (inner_loop)
+       {
+         if (!bitmap_bit_p (m_visited_loops, inner_loop->num))
+           {
+             m_loop_stack.safe_push (inner_loop);
+             bitmap_set_bit (m_visited_loops, inner_loop->num);
+           }
+         inner_loop = inner_loop->next;
+       }
+    }
+
+  return false;
+}
+
 void
 ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
 {
@@ -25344,6 +25424,15 @@ ix86_vector_costs::finish_cost (const vector_costs *scalar_costs)
          && (exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ())
              > ceil_log2 (LOOP_VINFO_INT_NITERS (loop_vinfo))))
        m_costs[vect_body] = INT_MAX;
+
+      /* Prohibit vectorization when the loop requires versioning
+        and loop_depth exceeds threshold.  */
+      if ((LOOP_REQUIRES_VERSIONING (loop_vinfo)
+          || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+          || LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          || vect_apply_runtime_profitability_check_p (loop_vinfo))
+         && ix86_vect_in_deep_nested_loop_p (LOOP_VINFO_LOOP (loop_vinfo)))
+       m_costs[vect_body] = INT_MAX;
     }
 
   ix86_vect_estimate_reg_pressure ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index 99e86f545e8..c5abf83473d 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1414,3 +1414,7 @@ Support MOVRS built-in functions and code generation.
 mamx-movrs
 Target Mask(ISA2_AMX_MOVRS) Var(ix86_isa_flags2) Save
 Support AMX-MOVRS built-in functions and code generation.
+
+-param=vect-max-loop-depth=
+Target Joined UInteger Var(ix86_vect_max_loop_depth) Init(8) Param
+Prevent loop vectorization when it's in a deeply nested loop and requires versioning, since it may increase register pressure too much.
-- 
2.34.1

[PATCH] [x86] [RFC] Prevent loop vectorization if it's in a deeply nested big loop.

Reply via email to