On Mon, 9 Dec 2024, liuhongt wrote:

> > Please pass 'sbitmap' instead of auto_sbitmap&, it should properly
> > decay to that.  Applies everywhere I think.
> >
> 
> Changed.
> 
> > In fact I wonder whether we should simply populate the bitmap
> > from a
> >
> >   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
> >     bitmap_set_bit (original_innermost, loop->num);
> >
> > loop instead and pass pass-is-cunrolli && bitmap_bit_p (..) as
> > cunrolli down to try_unroll_loop_completely?
> Changed, but still need to pass sbitmap from tree_unroll_loops_completely
> down to canonicalize_induction_variables and tree_unroll_loops_completely_1
> since "outer loop" could already be changed in later 2 callers.
> 
> BTW arm ci reported 2 regressed testcase so I added
> * gcc.dg/tree-ssa/pr83403-1.c: Add --param max-completely-peeled-insns=300 
> for arm*-*-*.
> * gcc.dg/tree-ssa/pr83403-2.c: Ditto.
> 
> For 32-bit arm, there're more stmts in the innermost loop,
> and removal of the reduction prevents completely unrolling for them.
> For aarch64, it looks fine.
> 
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?

OK.

Thanks,
Richard.

> r15-919-gef27b91b62c3aa removed 1 / 3 size reduction for innermost
> loop, but it doesn't accurately remember what's "innermost" for 2
> testcases in PR117888.
> 
> 1) For pass_cunroll, the "innermost" loop could be an originally outer
> loop with inner loop completely unrolled by cunrolli. The patch moves
> local variable cunrolli to parameter of tree_unroll_loops_completely
> and passes it directly from execute of the pass.
> 
> 2) For pass_cunrolli, cunrolli is set to false when the sibling loop
> of a innermost loop is completely unrolled, and it inaccurately
> takes the innermost loop as an "outer" loop. The patch add another
> paramter innermost to helps recognizing the "original" innermost loop.
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/117888
>       * tree-ssa-loop-ivcanon.cc (try_unroll_loop_completely): Use
>       cunrolli instead of cunrolli && !loop->inner to check if it's
>       innermost loop.
>       (canonicalize_loop_induction_variables): Add new parameter
>       const_sbitmap innermost, and pass
>       cunrolli
>       && (unsigned) loop->num < SBITMAP_SIZE (innermost)
>       && bitmap_bit_p (innermost, loop->num) as "cunrolli" to
>       try_unroll_loop_completely
>       (canonicalize_induction_variables): Pass innermost to
>       canonicalize_loop_induction_variables.
>       (tree_unroll_loops_completely_1): Add new parameter
>       const_sbitmap innermost.
>       (tree_unroll_loops_completely): Move local variable cunrolli
>       to parameter to indicate it's from pass cunrolli, also track
>       all "original" innermost loop at the beginning.
> 
> gcc/testsuite/ChangeLog:
> 
>       * gcc.dg/pr117888-2.c: New test.
>       * gcc.dg/vect/pr117888-1.c: Ditto.
>       * gcc.dg/tree-ssa/pr83403-1.c: Add
>       --param max-completely-peeled-insns=300 for arm*-*-*.
>       * gcc.dg/tree-ssa/pr83403-2.c: Ditto.
> ---
>  gcc/testsuite/gcc.dg/pr117888-2.c         | 37 ++++++++++++
>  gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c |  1 +
>  gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c |  1 +
>  gcc/testsuite/gcc.dg/vect/pr117888-1.c    | 71 +++++++++++++++++++++++
>  gcc/tree-ssa-loop-ivcanon.cc              | 60 +++++++++++++------
>  5 files changed, 151 insertions(+), 19 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/pr117888-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr117888-1.c
> 
> diff --git a/gcc/testsuite/gcc.dg/pr117888-2.c 
> b/gcc/testsuite/gcc.dg/pr117888-2.c
> new file mode 100644
> index 00000000000..97aa93d8ace
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/pr117888-2.c
> @@ -0,0 +1,37 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -funroll-loops -fno-tree-vectorize 
> -fdump-tree-cunroll-details" } */
> +
> +typedef struct {
> +  double real;
> +  double imag;
> +} complex;
> +
> +typedef struct { complex e[3][3]; } su3_matrix;
> +
> +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c )
> +{
> +  int i,j;
> +  double t,ar,ai,br,bi,cr,ci;
> +  for(i=0;i<3;i++)
> +    for(j=0;j<3;j++){
> +
> +      ar=a->e[i][0].real; ai=a->e[i][0].imag;
> +      br=b->e[0][j].real; bi=b->e[0][j].imag;
> +      cr=ar*br; t=ai*bi; cr -= t;
> +      ci=ar*bi; t=ai*br; ci += t;
> +
> +      ar=a->e[i][1].real; ai=a->e[i][1].imag;
> +      br=b->e[1][j].real; bi=b->e[1][j].imag;
> +      t=ar*br; cr += t; t=ai*bi; cr -= t;
> +      t=ar*bi; ci += t; t=ai*br; ci += t;
> +
> +      ar=a->e[i][2].real; ai=a->e[i][2].imag;
> +      br=b->e[2][j].real; bi=b->e[2][j].imag;
> +      t=ar*br; cr += t; t=ai*bi; cr -= t;
> +      t=ar*bi; ci += t; t=ai*br; ci += t;
> +
> +      c->e[i][j].real=cr;
> +      c->e[i][j].imag=ci;
> +    }
> +}
> +/* { dg-final { scan-tree-dump-times "optimized: loop with 2 iterations 
> completely unrolled" 1 "cunroll" { target i?86-*-* x86_64-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> index bfc703d1aa6..293fd2dbd97 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-1.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
>  /* { dg-additional-options "--param max-completely-peeled-insns=200" { 
> target { s390*-*-* } } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=300" { 
> target { arm*-*-* } } } */
>  
>  #define TYPE unsigned int
>  
> diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c 
> b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> index 9130d9bd583..b421b387bca 100644
> --- a/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr83403-2.c
> @@ -1,6 +1,7 @@
>  /* { dg-do compile } */
>  /* { dg-options "-O3 -funroll-loops -fdump-tree-lim2-details" } */
>  /* { dg-additional-options "--param max-completely-peeled-insns=200" { 
> target { s390*-*-* } } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=300" { 
> target { arm*-*-* } } } */
>  
>  #define TYPE int
>  
> diff --git a/gcc/testsuite/gcc.dg/vect/pr117888-1.c 
> b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
> new file mode 100644
> index 00000000000..4796a7c83c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr117888-1.c
> @@ -0,0 +1,71 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -funroll-loops -fdump-tree-vect-details" } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-require-effective-target vect_shift } */
> +/* { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } } */
> +/* { dg-additional-options "--param max-completely-peeled-insns=200" { 
> target powerpc64*-*-* } } */
> +
> +typedef unsigned short ggml_fp16_t;
> +static float table_f32_f16[1 << 16];
> +
> +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
> +  unsigned short s;
> +  __builtin_memcpy(&s, &f, sizeof(unsigned short));
> +  return table_f32_f16[s];
> +}
> +
> +typedef struct {
> +  ggml_fp16_t d;
> +  ggml_fp16_t m;
> +  unsigned char qh[4];
> +  unsigned char qs[32 / 2];
> +} block_q5_1;
> +
> +typedef struct {
> +  float d;
> +  float s;
> +  char qs[32];
> +} block_q8_1;
> +
> +void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * 
> restrict vx, const void * restrict vy) {
> +  const int qk = 32;
> +  const int nb = n / qk;
> +
> +  const block_q5_1 * restrict x = vx;
> +  const block_q8_1 * restrict y = vy;
> +
> +  float sumf = 0.0;
> +
> +  for (int i = 0; i < nb; i++) {
> +    unsigned qh;
> +    __builtin_memcpy(&qh, x[i].qh, sizeof(qh));
> +
> +    int sumi = 0;
> +
> +    if (qh) {
> +      for (int j = 0; j < qk/2; ++j) {
> +     const unsigned char xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
> +     const unsigned char xh_1 = ((qh >> (j + 12)) ) & 0x10;
> +
> +     const int x0 = (x[i].qs[j] & 0xF) | xh_0;
> +     const int x1 = (x[i].qs[j] >> 4) | xh_1;
> +
> +     sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +      }
> +    }
> +    else {
> +      for (int j = 0; j < qk/2; ++j) {
> +     const int x0 = (x[i].qs[j] & 0xF);
> +     const int x1 = (x[i].qs[j] >>  4);
> +
> +     sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
> +      }
> +    }
> +
> +    sumf += (ggml_lookup_fp16_to_fp32(x[i].d)*y[i].d)*sumi + 
> ggml_lookup_fp16_to_fp32(x[i].m)*y[i].s;
> +  }
> +
> +  *s = sumf;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> diff --git a/gcc/tree-ssa-loop-ivcanon.cc b/gcc/tree-ssa-loop-ivcanon.cc
> index 9a94d82fc4e..451b09adc26 100644
> --- a/gcc/tree-ssa-loop-ivcanon.cc
> +++ b/gcc/tree-ssa-loop-ivcanon.cc
> @@ -939,8 +939,7 @@ try_unroll_loop_completely (class loop *loop,
>            1) It could increase register pressure.
>            2) Big loop after completely unroll may not be vectorized
>            by BB vectorizer.  */
> -       else if ((cunrolli && !loop->inner
> -                 ? unr_insns : unr_insns - est_eliminated)
> +       else if ((cunrolli ? unr_insns : unr_insns - est_eliminated)
>                  > (unsigned) param_max_completely_peeled_insns)
>           {
>             if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1248,7 +1247,9 @@ try_peel_loop (class loop *loop,
>  static bool
>  canonicalize_loop_induction_variables (class loop *loop,
>                                      bool create_iv, enum unroll_level ul,
> -                                    bool try_eval, bool allow_peel, bool 
> cunrolli)
> +                                    bool try_eval, bool allow_peel,
> +                                    const_sbitmap innermost,
> +                                    bool cunrolli)
>  {
>    edge exit = NULL;
>    tree niter;
> @@ -1334,8 +1335,15 @@ canonicalize_loop_induction_variables (class loop 
> *loop,
>    modified |= remove_redundant_iv_tests (loop);
>  
>    dump_user_location_t locus = find_loop_location (loop);
> +
> +  bool innermost_cunrolli_p
> +    = cunrolli
> +      && (unsigned) loop->num < SBITMAP_SIZE (innermost)
> +      && bitmap_bit_p (innermost, loop->num);
> +
>    if (try_unroll_loop_completely (loop, exit, niter, may_be_zero, ul,
> -                               maxiter, locus, allow_peel, cunrolli))
> +                               maxiter, locus, allow_peel,
> +                               innermost_cunrolli_p))
>      return true;
>  
>    if (create_iv
> @@ -1372,14 +1380,19 @@ canonicalize_induction_variables (void)
>    bool changed = false;
>    bool irred_invalidated = false;
>    bitmap loop_closed_ssa_invalidated = BITMAP_ALLOC (NULL);
> +  auto_sbitmap innermost (number_of_loops (cfun));
> +  bitmap_clear (innermost);
>  
>    estimate_numbers_of_iterations (cfun);
>  
>    for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
>      {
> -      changed |= canonicalize_loop_induction_variables (loop,
> -                                                     true, UL_SINGLE_ITER,
> -                                                     true, false, false);
> +      changed
> +     |= canonicalize_loop_induction_variables (loop,
> +                                               true, UL_SINGLE_ITER,
> +                                               true, false,
> +                                               (const_sbitmap) innermost,
> +                                               false);
>      }
>    gcc_assert (!need_ssa_update_p (cfun));
>  
> @@ -1413,7 +1426,8 @@ canonicalize_induction_variables (void)
>  
>  static bool
>  tree_unroll_loops_completely_1 (bool may_increase_size, bool unroll_outer,
> -                             bitmap father_bbs, class loop *loop, bool 
> cunrolli)
> +                             bitmap father_bbs, class loop *loop,
> +                             const_sbitmap innermost, bool cunrolli)
>  {
>    class loop *loop_father;
>    bool changed = false;
> @@ -1431,7 +1445,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, 
> bool unroll_outer,
>       if (!child_father_bbs)
>         child_father_bbs = BITMAP_ALLOC (NULL);
>       if (tree_unroll_loops_completely_1 (may_increase_size, unroll_outer,
> -                                         child_father_bbs, inner, cunrolli))
> +                                         child_father_bbs, inner,
> +                                         innermost, cunrolli))
>         {
>           bitmap_ior_into (father_bbs, child_father_bbs);
>           bitmap_clear (child_father_bbs);
> @@ -1477,7 +1492,8 @@ tree_unroll_loops_completely_1 (bool may_increase_size, 
> bool unroll_outer,
>      ul = UL_NO_GROWTH;
>  
>    if (canonicalize_loop_induction_variables
> -      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer, cunrolli))
> +      (loop, false, ul, !flag_tree_loop_ivcanon, unroll_outer,
> +       innermost, cunrolli))
>      {
>        /* If we'll continue unrolling, we need to propagate constants
>        within the new basic blocks to fold away induction variable
> @@ -1503,19 +1519,28 @@ tree_unroll_loops_completely_1 (bool 
> may_increase_size, bool unroll_outer,
>  
>  /* Unroll LOOPS completely if they iterate just few times.  Unless
>     MAY_INCREASE_SIZE is true, perform the unrolling only if the
> -   size of the code does not increase.  */
> +   size of the code does not increase.
> +   cunrolli is true when passs is cunrolli.  */
>  
>  static unsigned int
> -tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer)
> +tree_unroll_loops_completely (bool may_increase_size, bool unroll_outer, 
> bool cunrolli)
>  {
>    bitmap father_bbs = BITMAP_ALLOC (NULL);
>    bool changed;
>    int iteration = 0;
>    bool irred_invalidated = false;
> -  bool cunrolli = true;
> +  auto_sbitmap innermost (number_of_loops (cfun));
> +  bitmap_clear (innermost);
>  
>    estimate_numbers_of_iterations (cfun);
>  
> +  /* Mark all innermost loop at the begining.  */
> +  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
> +    {
> +      if (!loop->inner)
> +     bitmap_set_bit (innermost, loop->num);
> +    }
> +
>    do
>      {
>        changed = false;
> @@ -1530,14 +1555,11 @@ tree_unroll_loops_completely (bool may_increase_size, 
> bool unroll_outer)
>        changed = tree_unroll_loops_completely_1 (may_increase_size,
>                                               unroll_outer, father_bbs,
>                                               current_loops->tree_root,
> +                                             (const_sbitmap) innermost,
>                                               cunrolli);
>        if (changed)
>       {
>         unsigned i;
> -       /* For the outer loop, considering that the inner loop is completely
> -          unrolled, it would expose more optimization opportunities, so it's
> -          better to keep 2/3 reduction of estimated unrolled size.  */
> -       cunrolli = false;
>  
>         unloop_loops (loops_to_unloop, loops_to_unloop_nunroll,
>                       edges_to_remove, loop_closed_ssa_invalidated,
> @@ -1697,7 +1719,7 @@ pass_complete_unroll::execute (function *fun)
>       re-peeling the same loop multiple times.  */
>    if (flag_peel_loops)
>      peeled_loops = BITMAP_ALLOC (NULL);
> -  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
> true);
> +  unsigned int val = tree_unroll_loops_completely (flag_cunroll_grow_size, 
> true, false);
>    if (peeled_loops)
>      {
>        BITMAP_FREE (peeled_loops);
> @@ -1753,7 +1775,7 @@ pass_complete_unrolli::execute (function *fun)
>    if (number_of_loops (fun) > 1)
>      {
>        scev_initialize ();
> -      ret = tree_unroll_loops_completely (optimize >= 3, false);
> +      ret = tree_unroll_loops_completely (optimize >= 3, false, true);
>        scev_finalize ();
>      }
>    loop_optimizer_finalize ();
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to