[PATCH] Fix inaccuracy in cunroll/cunrolli when considering what's innermost loop.

liuhongt Thu, 05 Dec 2024 18:29:07 -0800

r15-919-gef27b91b62c3aa removed 1 / 3 size reduction for innermost
loop, but it doesn't accurately remember what's "innermost" for 2
testcases in PR117888.


1) For pass_cunroll, the "innermost" loop could be an originally outer
loop with inner loop completely unrolled by cunrolli. The patch moves
local variable cunrolli to parameter of tree_unroll_loops_completely
and passes it directly from execute of the pass.

2) For pass_cunrolli, cunrolli is set to false when the sibling loop
of a innermost loop is completely unrolled, and it inaccurately
takes the innermost loop as an "outer" loop. The patch add another
paramter visited_innermost to helps recognizing the innermost loop.

Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?

gcc/ChangeLog:

        PR tree-optimization/117888
        * tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Add
        a new parameter visited_innermost to avoid inaccurately taking
        innermost as an outer loop after it's sibling loop is
        completed unrolled.
        (canonicalize_loop_induction_variables): Ditto.
        (canonicalize_induction_variables): Ditto.
        (tree_unroll_loops_completely_1):  Ditto.
        (tree_unroll_loops_completely): Move local variable cunrolli
        to parameter to indicate it's from pass cunrolli.

gcc/testsuite/ChangeLog:

        * gcc.dg/pr117888-2.c: New test.
        * gcc.dg/vect/pr117888-1.c: Ditto.
---
 gcc/testsuite/gcc.dg/pr117888-2.c      | 38 ++++++++++++++
 gcc/testsuite/gcc.dg/vect/pr117888-1.c | 71 ++++++++++++++++++++++++++
 gcc/tree-ssa-loop-ivcanon.cc           | 50 +++++++++++++-----
 3 files changed, 145 insertions(+), 14 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr117888-2.c
 create mode 100644 gcc/testsuite/gcc.dg/vect/pr117888-1.c

diff --git a/gcc/testsuite/gcc.dg/pr117888-2.c 
b/gcc/testsuite/gcc.dg/pr117888-2.c
new file mode 100644
index 00000000000..1749f1509a6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr117888-2.c
@@ -0,0 +1,38 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize 
-fdump-tree-cunroll-details" } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target 
powerpc64*-*-* } } */
+
+typedef struct {
+  double real;
+  double imag;
+} complex;
+
+typedef struct { complex e[3][3]; } su3_matrix;
+
+void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
+{
+  int i,j;
+  double t,ar,ai,br,bi,cr,ci;
+  for(i=0;i<3;i++)
+    for(j=0;j<3;j++){
+
+      ar=a->e[i][0].real; ai=a->e[i][0].imag;
+      br=b->e[0][j].real; bi=b->e[0][j].imag;
+      cr=ar*br; t=ai*bi; cr -= t;
+      ci=ar*bi; t=ai*br; ci += t;
+
+      ar=a->e[i][1].real; ai=a->e[i][1].imag;
+      br=b->e[1][j].real; bi=b->e[1][j].imag;
+      t=ar*br; cr += t; t=ai*bi; cr -= t;
+      t=ar*bi; ci += t; t=ai*br; ci += t;
+
+      ar=a->e[i][2].real; ai=a->e[i][2].imag;
+      br=b->e[2][j].real; bi=b->e[2][j].imag;
+      t=ar*br; cr += t; t=ai*bi; cr -= t;
+      t=ar*bi; ci += t; t=ai*br; ci += t;
+
+      c->e[i][j].real=cr;
+      c->e[i][j].imag=ci;
+    }
+}
+/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations 
completely unrolled" 1 "cunroll" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/pr117888-1.c 
b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
new file mode 100644
index 00000000000..4796a7c83c1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
@@ -0,0 +1,71 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
+/* { dg-additional-options "--param max-completely-peeled-insns=200" { target 
powerpc64*-*-* } } */
+
+typedef unsigned short ggml_fp16_t;
+static float table_f32_f16[1 << 16];
+
+inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
+  unsigned short s;
+  __builtin_memcpy(&s, &f, sizeof(unsigned short));
+  return table_f32_f16[s];
+}
+
+typedef struct {
+  ggml_fp16_t d;
+  ggml_fp16_t m;
+  unsigned char qh[4];
+  unsigned char qs[32 / 2];
+} block_q5_1;
+
+typedef struct {
+  float d;
+  float s;
+  char qs[32];
+} block_q8_1;
+
+void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
restrict vx, const void * restrict vy) {
+  const int qk = 32;
+  const int nb = n / qk;
+
+  const block_q5_1 * restrict x = vx;
+  const block_q8_1 * restrict y = vy;
+
+  float sumf = 0.0;
+
+  for (int i = 0; i < nb; i++) {
+    unsigned qh;
+    __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
+
+    int sumi = 0;
+
+    if (qh) {
+      for (int j = 0; j < qk/2; ++j) {
+       const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
+       const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
+
+       const int x0 = (x[i].qs[j] & 0xF) | xh_0;
+       const int x1 = (x[i].qs[j] >> 4) | xh_1;
+
+       sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+      }
+    }
+    else {
+      for (int j = 0; j < qk/2; ++j) {
+       const int x0 = (x[i].qs[j] & 0xF);
+       const int x1 = (x[i].qs[j] >>  4);
+
+       sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
+      }
+    }
+
+    sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
+  }
+
+  *s = sumf;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
index 9a94d82fc4e..f3ed3f9dc51 100644
--- a/gcc/tree-ssa-loop-ivcanon.cc
+++ b/gcc/tree-ssa-loop-ivcanon.cc
@@ -747,7 +747,7 @@ try_unroll_loop_completely (class loop *loop,
                            enum unroll_level ul,
                            HOST_WIDE_INT maxiter,
                            dump_user_location_t locus, bool allow_peel,
-                           bool cunrolli)
+                           auto_sbitmap& visited_innermost, bool cunrolli)
 {
   unsigned HOST_WIDE_INT n_unroll = 0;
   bool n_unroll_found = false;
@@ -939,7 +939,15 @@ try_unroll_loop_completely (class loop *loop,
             1) It could increase register pressure.
             2) Big loop after completely unroll may not be vectorized
             by BB vectorizer.  */
-         else if ((cunrolli && !loop->inner
+         else if ((!loop->inner
+                   /* It's a visited innermost, not a outer loop whose
+                      inner loops is completely unrolled or loops added
+                      by the recursive calls because SSA form is not
+                      up-to-date.  */
+                   && (unsigned) loop->num
+                       < SBITMAP_SIZE ((sbitmap) visited_innermost)
+                   && (cunrolli
+                       || bitmap_bit_p (visited_innermost, loop->num))
                    ? unr_insns : unr_insns - est_eliminated)
                   > (unsigned) param_max_completely_peeled_insns)
            {
@@ -948,6 +956,9 @@ try_unroll_loop_completely (class loop *loop,
                         "number of insns in the unrolled sequence reaches "
                         "--param max-completely-peeled-insns limit.\n",
                         loop->num);
+             if ((unsigned) loop->num
+                 < SBITMAP_SIZE ((sbitmap) visited_innermost))
+               bitmap_set_bit (visited_innermost, loop->num);
              return false;
            }
        }
@@ -1248,7 +1259,9 @@ try_peel_loop (class loop *loop,
 static bool
 canonicalize_loop_induction_variables (class loop *loop,
                                       bool create_iv, enum unroll_level ul,
-                                      bool try_eval, bool allow_peel, bool 
cunrolli)
+                                      bool try_eval, bool allow_peel,
+                                      auto_sbitmap& visited_innermost,
+                                      bool cunrolli)
 {
   edge exit = NULL;
   tree niter;
@@ -1335,7 +1348,8 @@ canonicalize_loop_induction_variables (class loop *loop,
 
   dump_user_location_t locus = find_loop_location (loop);
   if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
-                                 maxiter, locus, allow_peel, cunrolli))
+                                 maxiter, locus, allow_peel,
+                                 visited_innermost, cunrolli))
     return true;
 
   if (create_iv
@@ -1372,6 +1386,8 @@ canonicalize_induction_variables (void)
   bool changed = false;
   bool irred_invalidated = false;
   bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL);
+  auto_sbitmap visited_innermost (number_of_loops (cfun));
+  bitmap_clear (visited_innermost);
 
   estimate_numbers_of_iterations (cfun);
 
@@ -1379,7 +1395,8 @@ canonicalize_induction_variables (void)
     {
       changed |= canonicalize_loop_induction_variables (loop,
                                                        true, UL_SINGLE_ITER,
-                                                       true, false, false);
+                                                       true, false,
+                                                       visited_innermost, 
false);
     }
   gcc_assert (!need_ssa_update_p (cfun));
 
@@ -1413,7 +1430,8 @@ canonicalize_induction_variables (void)
 
 static bool
 tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
-                               bitmap father_bbs, class loop *loop, bool 
cunrolli)
+                               bitmap father_bbs, class loop *loop,
+                               auto_sbitmap& visited_innermost, bool cunrolli)
 {
   class loop *loop_father;
   bool changed = false;
@@ -1431,7 +1449,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, 
bool unroll_outer,
        if (!child_father_bbs)
          child_father_bbs = BITMAP_ALLOC (NULL);
        if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
-                                           child_father_bbs, inner, cunrolli))
+                                           child_father_bbs, inner,
+                                           visited_innermost, cunrolli))
          {
            bitmap_ior_into (father_bbs, child_father_bbs);
            bitmap_clear (child_father_bbs);
@@ -1477,7 +1496,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, 
bool unroll_outer,
     ul = UL_NO_GROWTH;
 
   if (canonicalize_loop_induction_variables
-      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
+      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer,
+       visited_innermost, cunrolli))
     {
       /* If we'll continue unrolling, we need to propagate constants
         within the new basic blocks to fold away induction variable
@@ -1503,16 +1523,18 @@ tree_unroll_loops_completely_1 (bool may_increase_size, 
bool unroll_outer,
 
 /* Unroll LOOPS completely if they iterate just few times.  Unless
    MAY_INCREASE_SIZE is true, perform the unrolling only if the
-   size of the code does not increase.  */
+   size of the code does not increase.
+   cunrolli is true when passs is cunrolli.  */
 
 static unsigned int
-tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
+tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, bool 
cunrolli)
 {
   bitmap father_bbs = BITMAP_ALLOC (NULL);
   bool changed;
   int iteration = 0;
   bool irred_invalidated = false;
-  bool cunrolli = true;
+  auto_sbitmap visited_innermost (number_of_loops (cfun));
+  bitmap_clear (visited_innermost);
 
   estimate_numbers_of_iterations (cfun);
 
@@ -1530,7 +1552,7 @@ tree_unroll_loops_completely (bool may_increase_size, 
bool unroll_outer)
       changed = tree_unroll_loops_completely_1 (may_increase_size,
                                                unroll_outer, father_bbs,
                                                current_loops->tree_root,
-                                               cunrolli);
+                                               visited_innermost, cunrolli);
       if (changed)
        {
          unsigned i;
@@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun)
      re-peeling the same loop multiple times.  */
   if (flag_peel_loops)
     peeled_loops = BITMAP_ALLOC (NULL);
-  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
true);
+  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
true, false);
   if (peeled_loops)
     {
       BITMAP_FREE (peeled_loops);
@@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun)
   if (number_of_loops (fun) > 1)
     {
       scev_initialize ();
-      ret = tree_unroll_loops_completely (optimize >= 3, false);
+      ret = tree_unroll_loops_completely (optimize >= 3, false, true);
       scev_finalize ();
     }
   loop_optimizer_finalize ();
-- 
2.34.1

[PATCH] Fix inaccuracy in cunroll/cunrolli when considering what's innermost loop.

Reply via email to