On 17/11/15 15:53, Tom de Vries wrote:
And the above LIM example
is none for why you need two LIM passes...

Indeed. I'm planning a separate reply to explain in more detail the need
for the two pass_lims.

I.

I managed to get rid of the two pass_lims for the motivating example that I used until now (goacc/kernels-double-reduction.c). I found that by adding a pass_dominator instance after pass_ch, I could get rid of the second pass_lim (and pass_copyprop as well).

But... then I wrote a counterexample (goacc/kernels-double-reduction-n.c), and I'm back at two pass_lims (and two pass_dominators).
Also, I've split the pass group into two parts: one before and one after pass_fre.

So, the current pass group looks like:
...
NEXT_PASS (pass_build_ealias);

/* Pass group that runs when the function is an offloaded function
   containing oacc kernels loops.  Part 1.  */
NEXT_PASS (pass_oacc_kernels);
PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels)
    /* We need pass_ch here, because pass_lim has no effect on
       exit-first loops (PR65442).  Ideally we want to remove both
       this pass instantiation, and the reverse transformation
       transform_to_exit_first_loop_alt, which is done in
       pass_parallelize_loops_oacc_kernels. */
    NEXT_PASS (pass_ch);
POP_INSERT_PASSES ()

NEXT_PASS (pass_fre);

/* Pass group that runs when the function is an offloaded function
   containing oacc kernels loops.  Part 2.  */
NEXT_PASS (pass_oacc_kernels2);
PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels2)
    /* We use pass_lim to rewrite in-memory iteration and reduction
       variable accesses in loops into local variables accesses.  */
    NEXT_PASS (pass_lim);
    NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
    NEXT_PASS (pass_lim);
    NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
    NEXT_PASS (pass_dce);
    NEXT_PASS (pass_parallelize_loops_oacc_kernels);
    NEXT_PASS (pass_expand_omp_ssa);
POP_INSERT_PASSES ()
NEXT_PASS (pass_merge_phi);
...


II.

The motivating test-case kernels-double-reduction-n.c:
...
#include <stdlib.h>

#define N 500

unsigned int a[N][N];

void  __attribute__((noinline,noclone))
foo (unsigned int n)
{
  int i, j;
  unsigned int sum = 1;

#pragma acc kernels copyin (a[0:n]) copy (sum)
  {
    for (i = 0; i < n; ++i)
      for (j = 0; j < n; ++j)
        sum += a[i][j];
  }

  if (sum != 5001)
    abort ();
}
...


III.

Before the first pass_lim. Note that there are no phis on the inner or outer loop header for the iteration variables or the reduction variable:
...
  <bb 2>:
  _5 = *.omp_data_i_4(D).i;
  *_5 = 0;
  _44 = *.omp_data_i_4(D).n;
  _45 = *_44;
  if (_45 != 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 4>: outer loop header
  _12 = *.omp_data_i_4(D).j;
  *_12 = 0;
  if (_45 != 0)
    goto <bb 6>;
  else
    goto <bb 5>;

  <bb 6>: inner loop header, latch
  _19 = *.omp_data_i_4(D).a;
  _21 = *_5;
  _23 = *_12;
  _24 = *_19[_21][_23];
  _25 = *.omp_data_i_4(D).sum;
  sum.0_26 = *_25;
  sum.1_27 = _24 + sum.0_26;
  *_25 = sum.1_27;
  _33 = _23 + 1;
  *_12 = _33;
  j.2_16 = (unsigned int) _33;
  if (j.2_16 < _45)
    goto <bb 6>;
  else
    goto <bb 5>;

  <bb 5>: outer loop latch
  _36 = *_5;
  _38 = _36 + 1;
  *_5 = _38;
  i.3_9 = (unsigned int) _38;
  if (i.3_9 < _45)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 3>:
  return;
...


IV.

After first pass_lim/pass_dom pair. Note there are phis on the inner loop header for the reduction and the iteration variable, but not on the outer loop header:
...
  <bb 2>:
  _5 = *.omp_data_i_4(D).i;
  *_5 = 0;
  _44 = *.omp_data_i_4(D).n;
  _45 = *_44;
  if (_45 != 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 4>:
  _12 = *.omp_data_i_4(D).j;
  _19 = *.omp_data_i_4(D).a;
  D__lsm.10_50 = *_12;
  D__lsm.11_51 = 0;
  _25 = *.omp_data_i_4(D).sum;

  <bb 5>: outer loop header
  D__lsm.10_20 = 0;
  D__lsm.11_22 = 1;
  _21 = *_5;
  D__lsm.12_28 = *_25;
  D__lsm.13_30 = 0;
  goto <bb 7>;

  <bb 7>: inner loop header, latch
  # D__lsm.10_47 = PHI <0(5), _33(7)>
  # D__lsm.12_49 = PHI <D__lsm.12_28(5), sum.1_27(7)>
  _23 = D__lsm.10_47;
  _24 = *_19[_21][D__lsm.10_47];
  sum.0_26 = D__lsm.12_49;
  sum.1_27 = _24 + D__lsm.12_49;
  D__lsm.12_31 = sum.1_27;
  D__lsm.13_32 = 1;
  _33 = D__lsm.10_47 + 1;
  D__lsm.10_14 = _33;
  D__lsm.11_15 = 1;
  j.2_16 = (unsigned int) _33;
  if (j.2_16 < _45)
    goto <bb 7>;
  else
    goto <bb 8>;

  <bb 8>: outer loop latch
  # D__lsm.10_35 = PHI <_33(7)>
  # D__lsm.11_37 = PHI <1(7)>
  # D__lsm.12_7 = PHI <sum.1_27(7)>
  # D__lsm.13_8 = PHI <1(7)>
  *_25 = sum.1_27;
  _36 = *_5;
  _38 = _36 + 1;
  *_5 = _38;
  i.3_9 = (unsigned int) _38;
  if (i.3_9 < _45)
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 6>:
  # D__lsm.10_10 = PHI <_33(8)>
  # D__lsm.11_11 = PHI <1(8)>
  *_12 = _33;
  goto <bb 3>;

  <bb 3>:
  return;
...


V.

After second pass_lim/pass_dom pair. Note there are phis on the inner and outer loop header for the reduction and the iteration variables:
...
  <bb 2>:
  _5 = *.omp_data_i_4(D).i;
  *_5 = 0;
  _44 = *.omp_data_i_4(D).n;
  _45 = *_44;
  if (_45 != 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 4>:
  _12 = *.omp_data_i_4(D).j;
  _19 = *.omp_data_i_4(D).a;
  D__lsm.10_50 = *_12;
  D__lsm.11_51 = 0;
  _25 = *.omp_data_i_4(D).sum;
  D__lsm.14_40 = 0;
  D__lsm.15_2 = 0;
  D__lsm.16_1 = *_25;
  D__lsm.17_46 = 0;

  <bb 5>: outer loop header
  # D__lsm.14_13 = PHI <0(4), _38(8)>
  # D__lsm.16_34 = PHI <D__lsm.16_1(4), sum.1_27(8)>
  D__lsm.10_20 = 0;
  D__lsm.11_22 = 1;
  _21 = D__lsm.14_13;
  D__lsm.12_28 = D__lsm.16_34;
  D__lsm.13_30 = 0;
  goto <bb 7>;

  <bb 7>: inner loop header, latch
  # D__lsm.10_47 = PHI <0(5), _33(7)>
  # D__lsm.12_49 = PHI <D__lsm.16_34(5), sum.1_27(7)>
  _23 = D__lsm.10_47;
  _24 = *_19[D__lsm.14_13][D__lsm.10_47];
  sum.0_26 = D__lsm.12_49;
  sum.1_27 = _24 + D__lsm.12_49;
  D__lsm.12_31 = sum.1_27;
  D__lsm.13_32 = 1;
  _33 = D__lsm.10_47 + 1;
  D__lsm.10_14 = _33;
  D__lsm.11_15 = 1;
  j.2_16 = (unsigned int) _33;
  if (j.2_16 < _45)
    goto <bb 7>;
  else
    goto <bb 8>;

  <bb 8>: outer loop latch
  # D__lsm.10_35 = PHI <_33(7)>
  # D__lsm.11_37 = PHI <1(7)>
  # D__lsm.12_7 = PHI <sum.1_27(7)>
  # D__lsm.13_8 = PHI <1(7)>
  # sum.1_48 = PHI <sum.1_27(7)>
  # _53 = PHI <_33(7)>
  D__lsm.16_56 = sum.1_27;
  D__lsm.17_57 = 1;
  _36 = D__lsm.14_13;
  _38 = D__lsm.14_13 + 1;
  D__lsm.14_58 = _38;
  D__lsm.15_59 = 1;
  i.3_9 = (unsigned int) _38;
  if (i.3_9 < _45)
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 6>:
  # D__lsm.10_10 = PHI <_33(8)>
  # D__lsm.11_11 = PHI <1(8)>
  # _43 = PHI <_33(8)>
  # D__lsm.16_62 = PHI <sum.1_27(8)>
  # D__lsm.17_63 = PHI <1(8)>
  # D__lsm.14_64 = PHI <_38(8)>
  # D__lsm.15_65 = PHI <1(8)>
  *_5 = _38;
  *_25 = sum.1_27;
  *_12 = _33;
  goto <bb 3>;

  <bb 3>:
  return;
...


VI.

After pass_dce, so before parloops-oacc-kernels:
...
  <bb 2>:
  _5 = *.omp_data_i_4(D).i;
  *_5 = 0;
  _44 = *.omp_data_i_4(D).n;
  _45 = *_44;
  if (_45 != 0)
    goto <bb 4>;
  else
    goto <bb 3>;

  <bb 4>:
  _12 = *.omp_data_i_4(D).j;
  _19 = *.omp_data_i_4(D).a;
  _25 = *.omp_data_i_4(D).sum;
  D__lsm.16_1 = *_25;

  <bb 5>: outer loop header
  # D__lsm.14_13 = PHI <0(4), _38(8)>
  # D__lsm.16_34 = PHI <D__lsm.16_1(4), sum.1_27(8)>
  goto <bb 7>;

  <bb 7>: inner loop header, latch
  # D__lsm.10_47 = PHI <0(5), _33(7)>
  # D__lsm.12_49 = PHI <D__lsm.16_34(5), sum.1_27(7)>
  _24 = *_19[D__lsm.14_13][D__lsm.10_47];
  sum.1_27 = _24 + D__lsm.12_49;
  _33 = D__lsm.10_47 + 1;
  j.2_16 = (unsigned int) _33;
  if (j.2_16 < _45)
    goto <bb 7>;
  else
    goto <bb 8>;

  <bb 8>: outer loop latch
  _38 = D__lsm.14_13 + 1;
  i.3_9 = (unsigned int) _38;
  if (i.3_9 < _45)
    goto <bb 5>;
  else
    goto <bb 6>;

  <bb 6>:
  *_5 = _38;
  *_25 = sum.1_27;
  *_12 = _33;
  goto <bb 3>;

  <bb 3>:
  return;
...

Thanks,
- Tom

Reply via email to