On 17/11/15 15:53, Tom de Vries wrote:
And the above LIM example
is none for why you need two LIM passes...
Indeed. I'm planning a separate reply to explain in more detail the need
for the two pass_lims.
I.
I managed to get rid of the two pass_lims for the motivating example
that I used until now (goacc/kernels-double-reduction.c). I found that
by adding a pass_dominator instance after pass_ch, I could get rid of
the second pass_lim (and pass_copyprop as well).
But... then I wrote a counter example
(goacc/kernels-double-reduction-n.c), and I'm back at two pass_lims (and
two pass_dominators).
Also I've split the pass group into a bit before and after pass_fre.
So, the current pass group looks like:
...
NEXT_PASS (pass_build_ealias);
/* Pass group that runs when the function is an offloaded function
containing oacc kernels loops. Part 1. */
NEXT_PASS (pass_oacc_kernels);
PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels)
/* We need pass_ch here, because pass_lim has no effect on
exit-first loops (PR65442). Ideally we want to remove both
this pass instantiation, and the reverse transformation
transform_to_exit_first_loop_alt, which is done in
pass_parallelize_loops_oacc_kernels. */
NEXT_PASS (pass_ch);
POP_INSERT_PASSES ()
NEXT_PASS (pass_fre);
/* Pass group that runs when the function is an offloaded function
containing oacc kernels loops. Part 2. */
NEXT_PASS (pass_oacc_kernels2);
PUSH_INSERT_PASSES_WITHIN (pass_oacc_kernels2)
/* We use pass_lim to rewrite in-memory iteration and reduction
variable accesses in loops into local variables accesses. */
NEXT_PASS (pass_lim);
NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
NEXT_PASS (pass_lim);
NEXT_PASS (pass_dominator, false /* may_peel_loop_headers_p */);
NEXT_PASS (pass_dce);
NEXT_PASS (pass_parallelize_loops_oacc_kernels);
NEXT_PASS (pass_expand_omp_ssa);
POP_INSERT_PASSES ()
NEXT_PASS (pass_merge_phi);
...
II.
The motivating test-case kernels-double-reduction-n.c:
...
#include <stdlib.h>
#define N 500
unsigned int a[N][N];
void __attribute__((noinline,noclone))
foo (unsigned int n)
{
int i, j;
unsigned int sum = 1;
#pragma acc kernels copyin (a[0:n]) copy (sum)
{
for (i = 0; i < n; ++i)
for (j = 0; j < n; ++j)
sum += a[i][j];
}
if (sum != 5001)
abort ();
}
...
III.
Before first pass_lim. Note no phis on inner or outer loop header for
iteration varables or reduction variable:
...
<bb 2>:
_5 = *.omp_data_i_4(D).i;
*_5 = 0;
_44 = *.omp_data_i_4(D).n;
_45 = *_44;
if (_45 != 0)
goto <bb 4>;
else
goto <bb 3>;
<bb 4>: outer loop header
_12 = *.omp_data_i_4(D).j;
*_12 = 0;
if (_45 != 0)
goto <bb 6>;
else
goto <bb 5>;
<bb 6>: inner loop header, latch
_19 = *.omp_data_i_4(D).a;
_21 = *_5;
_23 = *_12;
_24 = *_19[_21][_23];
_25 = *.omp_data_i_4(D).sum;
sum.0_26 = *_25;
sum.1_27 = _24 + sum.0_26;
*_25 = sum.1_27;
_33 = _23 + 1;
*_12 = _33;
j.2_16 = (unsigned int) _33;
if (j.2_16 < _45)
goto <bb 6>;
else
goto <bb 5>;
<bb 5>: outer loop latch
_36 = *_5;
_38 = _36 + 1;
*_5 = _38;
i.3_9 = (unsigned int) _38;
if (i.3_9 < _45)
goto <bb 4>;
else
goto <bb 3>;
<bb 3>:
return;
...
IV.
After first pass_lim/pass_dom pair. Note there are phis on the inner
loop header for the reduction and the iteration variable, but not on the
outer loop header:
...
<bb 2>:
_5 = *.omp_data_i_4(D).i;
*_5 = 0;
_44 = *.omp_data_i_4(D).n;
_45 = *_44;
if (_45 != 0)
goto <bb 4>;
else
goto <bb 3>;
<bb 4>:
_12 = *.omp_data_i_4(D).j;
_19 = *.omp_data_i_4(D).a;
D__lsm.10_50 = *_12;
D__lsm.11_51 = 0;
_25 = *.omp_data_i_4(D).sum;
<bb 5>: outer loop header
D__lsm.10_20 = 0;
D__lsm.11_22 = 1;
_21 = *_5;
D__lsm.12_28 = *_25;
D__lsm.13_30 = 0;
goto <bb 7>;
<bb 7>: inner loop header, latch
# D__lsm.10_47 = PHI <0(5), _33(7)>
# D__lsm.12_49 = PHI <D__lsm.12_28(5), sum.1_27(7)>
_23 = D__lsm.10_47;
_24 = *_19[_21][D__lsm.10_47];
sum.0_26 = D__lsm.12_49;
sum.1_27 = _24 + D__lsm.12_49;
D__lsm.12_31 = sum.1_27;
D__lsm.13_32 = 1;
_33 = D__lsm.10_47 + 1;
D__lsm.10_14 = _33;
D__lsm.11_15 = 1;
j.2_16 = (unsigned int) _33;
if (j.2_16 < _45)
goto <bb 7>;
else
goto <bb 8>;
<bb 8>: outer loop latch
# D__lsm.10_35 = PHI <_33(7)>
# D__lsm.11_37 = PHI <1(7)>
# D__lsm.12_7 = PHI <sum.1_27(7)>
# D__lsm.13_8 = PHI <1(7)>
*_25 = sum.1_27;
_36 = *_5;
_38 = _36 + 1;
*_5 = _38;
i.3_9 = (unsigned int) _38;
if (i.3_9 < _45)
goto <bb 5>;
else
goto <bb 6>;
<bb 6>:
# D__lsm.10_10 = PHI <_33(8)>
# D__lsm.11_11 = PHI <1(8)>
*_12 = _33;
goto <bb 3>;
<bb 3>:
return;
...
V.
After second pass_lim/pass_dom pair. Note there are phis on the inner
and outer loop header for the reduction and the iteration variables:
...
<bb 2>:
_5 = *.omp_data_i_4(D).i;
*_5 = 0;
_44 = *.omp_data_i_4(D).n;
_45 = *_44;
if (_45 != 0)
goto <bb 4>;
else
goto <bb 3>;
<bb 4>:
_12 = *.omp_data_i_4(D).j;
_19 = *.omp_data_i_4(D).a;
D__lsm.10_50 = *_12;
D__lsm.11_51 = 0;
_25 = *.omp_data_i_4(D).sum;
D__lsm.14_40 = 0;
D__lsm.15_2 = 0;
D__lsm.16_1 = *_25;
D__lsm.17_46 = 0;
<bb 5>: outer loop header
# D__lsm.14_13 = PHI <0(4), _38(8)>
# D__lsm.16_34 = PHI <D__lsm.16_1(4), sum.1_27(8)>
D__lsm.10_20 = 0;
D__lsm.11_22 = 1;
_21 = D__lsm.14_13;
D__lsm.12_28 = D__lsm.16_34;
D__lsm.13_30 = 0;
goto <bb 7>;
<bb 7>: inner loop header, latch
# D__lsm.10_47 = PHI <0(5), _33(7)>
# D__lsm.12_49 = PHI <D__lsm.16_34(5), sum.1_27(7)>
_23 = D__lsm.10_47;
_24 = *_19[D__lsm.14_13][D__lsm.10_47];
sum.0_26 = D__lsm.12_49;
sum.1_27 = _24 + D__lsm.12_49;
D__lsm.12_31 = sum.1_27;
D__lsm.13_32 = 1;
_33 = D__lsm.10_47 + 1;
D__lsm.10_14 = _33;
D__lsm.11_15 = 1;
j.2_16 = (unsigned int) _33;
if (j.2_16 < _45)
goto <bb 7>;
else
goto <bb 8>;
<bb 8>: outer loop latch
# D__lsm.10_35 = PHI <_33(7)>
# D__lsm.11_37 = PHI <1(7)>
# D__lsm.12_7 = PHI <sum.1_27(7)>
# D__lsm.13_8 = PHI <1(7)>
# sum.1_48 = PHI <sum.1_27(7)>
# _53 = PHI <_33(7)>
D__lsm.16_56 = sum.1_27;
D__lsm.17_57 = 1;
_36 = D__lsm.14_13;
_38 = D__lsm.14_13 + 1;
D__lsm.14_58 = _38;
D__lsm.15_59 = 1;
i.3_9 = (unsigned int) _38;
if (i.3_9 < _45)
goto <bb 5>;
else
goto <bb 6>;
<bb 6>:
# D__lsm.10_10 = PHI <_33(8)>
# D__lsm.11_11 = PHI <1(8)>
# _43 = PHI <_33(8)>
# D__lsm.16_62 = PHI <sum.1_27(8)>
# D__lsm.17_63 = PHI <1(8)>
# D__lsm.14_64 = PHI <_38(8)>
# D__lsm.15_65 = PHI <1(8)>
*_5 = _38;
*_25 = sum.1_27;
*_12 = _33;
goto <bb 3>;
<bb 3>:
return;
...
VI.
After pass_dce, so before parloops-oacc-kernels:
...
<bb 2>:
_5 = *.omp_data_i_4(D).i;
*_5 = 0;
_44 = *.omp_data_i_4(D).n;
_45 = *_44;
if (_45 != 0)
goto <bb 4>;
else
goto <bb 3>;
<bb 4>:
_12 = *.omp_data_i_4(D).j;
_19 = *.omp_data_i_4(D).a;
_25 = *.omp_data_i_4(D).sum;
D__lsm.16_1 = *_25;
<bb 5>: outer loop header
# D__lsm.14_13 = PHI <0(4), _38(8)>
# D__lsm.16_34 = PHI <D__lsm.16_1(4), sum.1_27(8)>
goto <bb 7>;
<bb 7>: inner loop header, latch
# D__lsm.10_47 = PHI <0(5), _33(7)>
# D__lsm.12_49 = PHI <D__lsm.16_34(5), sum.1_27(7)>
_24 = *_19[D__lsm.14_13][D__lsm.10_47];
sum.1_27 = _24 + D__lsm.12_49;
_33 = D__lsm.10_47 + 1;
j.2_16 = (unsigned int) _33;
if (j.2_16 < _45)
goto <bb 7>;
else
goto <bb 8>;
<bb 8>: outer loop latch
_38 = D__lsm.14_13 + 1;
i.3_9 = (unsigned int) _38;
if (i.3_9 < _45)
goto <bb 5>;
else
goto <bb 6>;
<bb 6>:
*_5 = _38;
*_25 = sum.1_27;
*_12 = _33;
goto <bb 3>;
<bb 3>:
return;
...
Thanks,
- Tom