On Wed, Jun 17, 2020 at 02:29:40PM +0200, Richard Biener wrote:
> So this reminds me of the loop scalarization pass Sebastian once
> implemented - that is, with the above constraints it looks like we
> can do the actual collapsing, replacing collapsed loops with a new
> one with a canonical IV from 0 to N (thus another rectangular one).
> The old IVs would be either directly computed from that N
> (but that requires costly modulo) or be derived IVs that would
> not be affine (because they are "wrapping", aka reset).
For rectangular loop nests this is something that omp-expand.c has done
already since June 2008; the major difference is that it is far easier
for rectangular loops: the total number of iterations of the loop nest is
the product of the numbers of iterations of each associated loop, and
determining the user IVs from the collapsed 0 to N counter can be done
using division and modulo.
The comments in omp-expand.c describe it:
zero3 = N32 c3 N31;
count3 = (N32 - N31) /[cl] STEP3;
zero2 = N22 c2 N21;
count2 = (N22 - N21) /[cl] STEP2;
zero1 = N12 c1 N11;
count1 = (N12 - N11) /[cl] STEP1;
zero = zero3 || zero2 || zero1;
count = count1 * count2 * count3;
if (__builtin_expect(zero, false)) goto zero_iter_bb;
and
/* Helper function for expand_omp_{for_*,simd}. Generate code like:
T = V;
V3 = N31 + (T % count3) * STEP3;
T = T / count3;
V2 = N21 + (T % count2) * STEP2;
T = T / count2;
V1 = N11 + T * STEP1;
if this loop doesn't have an inner loop construct combined with it.
If it does have an inner loop construct combined with it and the
iteration count isn't known constant, store values from counts array
into its _looptemp_ temporaries instead. */
What I need to do now is just extend this to non-rectangular loops,
and that involves Summæ Potestatum (Faulhaber's formula) for computing the
count of iterations and finding roots of quadratic/cubic etc. equations for
the second step.
> Are the loops required to be perfectly nesting? At least those
> that collapse?
Before OpenMP 5.0 they are required to be perfectly nested, 5.0 essentially
allows some non-perfect nesting but says that it is unspecified how many
times the extra code that makes the loop not perfectly nested is executed,
so it should be possible to e.g. move it into the innermost loop body if
needed, or perform just once for the outer loop iterator etc.
Anyway, here is the updated proof of concept, which should use the
n*(n+1)/2 closed form and isqrt even if the inner loop has no iterations
for some values of the outer one.
CCing also Sebastian if he has some ideas.
Jakub
/* Proof of concept for OpenMP non-rectangular worksharing-loop
implementation.
Copyright (C) 2020 Free Software Foundation, Inc.
GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.
GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
for more details.
You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3. If not see
<http://www.gnu.org/licenses/>. */
#include <stdlib.h>
#include <stdio.h>
#ifdef _OPENMP
#include <omp.h>
#endif
/* Shared loop state: i, j are the user iteration variables and x the body
   result; both foo and bar assign their final ("lastprivate") values here.
   nitersv and niterscnt count executed iterations when built with -DDEBUG.  */
int x, i, j, nitersv, niterscnt;
/* Debug tracing macro: prints under -DDEBUG, expands to nothing otherwise.  */
#ifdef DEBUG
#define DPRINTF(...) printf (__VA_ARGS__)
#else
#define DPRINTF(...) do {} while (0)
#endif
/* Serial stubs so the program also builds and runs without -fopenmp.  */
#ifndef _OPENMP
#define omp_get_num_threads() 1
#define omp_get_thread_num() 0
#endif
/* Reference implementation: run the non-rectangular loop nest serially
   (via "omp single") so that the final values of i, j and x — and under
   DEBUG the iteration count nitersv — can be compared against bar's
   worksharing implementation of the same collapse(2) loop.  */
void
foo (int a, int b, int c, int d, int e, int f, int g, int h)
{
#ifdef DEBUG
/* Reset the debug iteration counter once per team.  */
#pragma omp single
nitersv = 0;
#endif
#if 0
/* The worksharing form this proof of concept is about; disabled so that
   foo serves as the single-threaded reference.  */
#pragma omp for collapse(2) lastprivate (i, j, x)
#else
#pragma omp single
#endif
for (i = a; i < b; i += c)
/* Inner loop bounds depend linearly on the outer iterator i, which is
   what makes the nest non-rectangular.  */
for (j = d * i + e; j < f * i + g; j += h)
{
x = i * 1024 + (j & 1023);
#ifdef DEBUG
nitersv++;
#endif
DPRINTF ("%d %d %d %d\n", i, j, x, omp_get_thread_num ());
}
#ifdef DEBUG
#pragma omp single
DPRINTF ("niters = %d\n", nitersv);
#endif
}
/* Worksharing implementation of the same non-rectangular collapse(2) loop
   as foo: compute the total iteration count in closed form, split the
   logical iteration space [0, niters) statically among the threads, map
   each thread's first logical iteration back to (i, j) values using an
   integer square root, then iterate, and finally handle lastprivate.  */
void
bar (int a, int b, int c, int d, int e, int f, int g, int h)
{
/* Proposed implementation of:
#pragma omp for collapse(2) lastprivate (i, j, x)
for (i = a; i < b; i += c)
for (j = d * i + e; j < f * i + g; j += h) */
/* OpenMP requires that ((f - d) * c) % h == 0
and that either the initializer and condition expressions
are outermost loop invariant, or have syntactic forms that can
be represented as integral a1 * var-outer + a2 where var-outer
is some outer loop iterator with compatible type and a1 and a2 are
integral expressions (and have compatible type too), which are outermost
loop invariant. Comparisons can be <, <=, >, >= or != but in the last
case the step is required to be compile time constant so that one
can determine iteration direction and for the others the step has to
match the iteration direction of the comparison operator.
And also a requirement that essentially says that there
is no wrap-around in any of the iterators and that the number of
iterations can be computed without risks of overflows/wrap-arounds.
Any number of loops can be collapsed and all but the outermost can be
non-rectangular (or at least potentially one, basically where the
expressions refer to the outer loop iterator). The step expressions
must be always outermost loop invariant. */
/* First try to calculate the total number of iterations.
Can be simplified by computing all outermost rectangular loops
whose iterator vars are not referenced in the non-rectangular loops
separately, and similarly all innermost rectangular loops separately. */
int niters = 0;
if (__builtin_expect (!(a < b), 0))
{
/* No iterations at all, only i defined after loop. */
i = a;
goto end;
}
/* If the (middle) non-rectangular loops are triangular (or perhaps in some
more cases using Faulhaber's formula?), check if for all the iterators the
inner loop will have at least one iteration.
If all of a, b, c, d, e, f, g, h are compile time constants, we want
to compute niters at compile time obviously. If only some of them
are constant, let the normal optimizations simplify the expressions
correspondingly. */
/* Working copies of the outer loop bounds; the repeat loop below narrows
   [as, bs) until the inner loop is non-empty at both ends of the range.  */
int as = a;
int bs = b;
repeat:;
/* t4 = number of outer iterations in [as, bs) with step c (ceiling
   division); t5 = value of the outer iterator on its last iteration.  */
int t4 = (bs + (c - 1) - as) / c;
int t5 = as + ((t4 - 1) * c);
/* Inner loop bounds at the first (t8, t9) and last (t8e, t9e) outer
   iteration of the current [as, bs) range.  */
int t8 = d * as + e;
int t9 = f * as + g;
int t8e = d * t5 + e;
int t9e = f * t5 + g;
int t10, t11;
/* Is the inner loop empty at the first outer iteration?  */
if (__builtin_expect (!(t8 < t9), 0))
{
if (__builtin_expect (!(t8e < t9e), 0))
{
/* No iterations at all, only i defined after loop. */
i = a;
goto end;
}
/* t17 approximates the outer iterator value where d * x + e crosses
   f * x + g, i.e. where the inner loop switches between empty and
   non-empty; snap it down to an actual iterate of the outer IV.
   The two branches differ in emptiness here, so f != d and the
   division is well defined.  */
int t17 = (e - g) / (f - d);
t17 -= (t17 - a) % c;
if (d * t17 + e < f * t17 + g)
{
/* assert (d * (t17 - c) + e >= f * (t17 - c) + g); */
as = t17;
}
else
{
/* assert (d * (t17 + c) + e < f * (t17 + c) + g); */
as = t17 + c;
}
goto repeat;
}
/* Inner loop empty at the last outer iteration: shrink bs similarly.  */
else if (__builtin_expect (!(t8e < t9e), 0))
{
int t18 = (e - g) / (f - d);
t18 -= (t18 - a) % c;
if (d * t18 + e >= f * t18 + g)
{
/* assert (d * (t18 - c) + e < f * (t18 - c) + g); */
bs = t18;
}
else
{
/* assert (d * (t18 + c) + e >= f * (t18 + c) + g); */
bs = t18 + c;
}
goto repeat;
}
if (1)
{
/* assert (t8 < t9 && t8e < t9e); */
/* t10 = inner iteration count at the first outer iteration;
   t11 = per-outer-iteration increase of the inner count, an integer
   because OpenMP requires ((f - d) * c) % h == 0 (see above).  */
t10 = ((t9 + (h - 1) - t8) / h);
t11 = ((f - d) * c / h);
/* Arithmetic series: sum over x in [0, t4) of (t10 + x * t11).  */
niters = t4 * t10 + t11 * (((t4 - 1) * t4) / 2);
}
else
{
t10 = t11 = 0;
/* Fallback implementation, if it above gets too ugly/hard. Repeat all
loops except the innermost, hope loop optimizations optimize at least
something. */
for (int t1 = a; t1 < b; t1 += c)
{
int t2 = d * t1 + e;
int t3 = f * t1 + g;
if (t2 < t3)
niters += (t3 + (h - 1) - t2) / h;
}
}
/* Second step, the usual GCC OpenMP schedule(static) computation
of which iterations should the current thread take. */
int num_threads = omp_get_num_threads ();
int thr = omp_get_thread_num ();
/* t6 = base chunk size, t7 = remainder; the first t7 threads each take
   one extra iteration.  */
int t6 = niters / num_threads;
int t7 = niters % num_threads;
if (thr < t7) { t7 = 0; t6++; }
int start = t6 * thr + t7;
int end = start + t6;
if (!(start < end))
goto end;
/* Now, start contains the first logical iteration that this
thread should handle (counted from 0) and end should be the
first one it should already not handle. */
/* Third step, from start try to determine the initial values of
the loop iteration variables. */
int ipriv, jpriv;
if (t10)
{
/* We want to find maximum x such that
start >= x * t10 + t11 * (((x - 1) * x) / 2)
and from that x compute both indexes:
ipriv = as + x * c;
jpriv = d * ipriv + e + (start - (x * t10 + t11 * (((x - 1) * x) /
2))) * h. */
/* x = (isqrt((t10-t11/2)*(t10-t11/2)+start)-(t10-t11/2)) / t11 would be
a rough
guess that needs to be verified. */
/* t12 is the linear coefficient obtained from completing the square in
   the quadratic above.  */
int t12 = t10 - t11 / 2;
/* Quick overflow check for the ^2. */
/* Passes iff -45000 <= t12 <= 45000 (unsigned wraparound folds the two
   comparisons into one), so t12 * t12 below cannot overflow.  */
if (__builtin_expect (t12 + 45000U > 90000U, 0))
goto fallback;
unsigned t13 = t12 * t12 + (unsigned) start;
if (__builtin_expect (t13 == 0 || t11 == 0, 0))
goto fallback;
/* Compute isqrt(t13). */
/* isqrtb = upper bound on isqrt(t13) derived from the bit length of
   t13; then binary-search [isqrta, isqrtb] for the exact root.  */
unsigned isqrtb = (1U << (__SIZEOF_INT__ * __CHAR_BIT__ + 1
- __builtin_clz (t13)) /2) - 1;
unsigned isqrta = (isqrtb + 3) / 2;
do {
unsigned isqrtm = (isqrta + isqrtb) >> 1;
if (isqrtm * isqrtm > t13)
isqrtb = isqrtm - 1;
else
isqrta = isqrtm + 1;
} while (isqrtb >= isqrta);
unsigned isqrt = isqrta - 1;
/* t14 = candidate outer logical index from the rough guess;
   t15 = logical iteration number at which outer iteration t14 starts.  */
unsigned t14 = (isqrt - t12) / t11;
unsigned t15 = t14 * t10 + t11 * (((t14 - 1) * t14) / 2);
/* Verify the guess; it may be off by one in either direction, and
   anything further off goes to the linear-search fallback.
   NOTE(review): the +/- (t10 + t14) boundary adjustment looks exact
   only for t11 == 1 steps; for t11 > 1 it appears conservative (worst
   case it diverts to the fallback) — verify against the stress test.  */
if (__builtin_expect (start >= t15, 1))
{
unsigned t16 = t15 + t10 + t14;
if (__builtin_expect (start >= t16, 0))
goto fallback;
}
else
{
unsigned t16 = t15 - t10 - t14;
if (__builtin_expect (start < t16, 0))
goto fallback;
t14--;
t15 = t16;
}
/* Materialize the user IVs for logical iteration `start'.  */
ipriv = as + (int) t14 * c;
jpriv = d * ipriv + e + (start - (int) t15) * h;
}
else
{
/* Fallback implementation, if it above gets too ugly/hard. Repeat all
loops except the innermost, hope loop optimizations optimize at least
something. */
fallback:;
/* Linear scan over the outer iterations, accumulating inner counts
   until the one containing logical iteration `start' is found.  */
int cnt = 0;
for (int t1 = as; t1 < bs; t1 += c)
{
int t2 = d * t1 + e;
int t3 = f * t1 + g;
if (t2 < t3)
{
int t8 = (t3 + (h - 1) - t2) / h;
if (cnt + t8 > start)
{
ipriv = t1;
jpriv = t2 + (start - cnt) * h;
goto done;
}
else
cnt += t8;
}
}
done:;
}
/* Execute logical iterations [start, end) sequentially, maintaining
   ipriv/jpriv as the privatized user IVs.  */
int jmax = f * ipriv + g;
int xpriv;
do
{
/* Now the body, with the privatized copies of the loop iterators
as well as other privatized variables as usual in OpenMP. */
{
xpriv = ipriv * 1024 + (jpriv & 1023);
DPRINTF ("%d %d %d %d\n", ipriv, jpriv, xpriv, omp_get_thread_num ());
}
#ifdef DEBUG
/* Sanity-check that (ipriv, jpriv) is a valid iteration of the
   original nest.  */
if (ipriv < a || ipriv >= b)
abort ();
if ((ipriv - a) % c)
abort ();
if (jpriv < d * ipriv + e || jpriv >= f * ipriv + g)
abort ();
if ((jpriv - (d * ipriv + e)) % h)
abort ();
#pragma omp atomic
++niterscnt;
#endif
/* Use start as the logical iteration counter. */
start++;
if (!(start < end))
break;
/* Now bump the innermost iterator. */
jpriv += h;
if (jpriv < jmax)
continue;
/* The outermost iterator doesn't need condition checking, we have done
that
already through the start < end check. */
ipriv += c;
/* Or precompute earlier how to bump jmax and jmin less expensively? */
jpriv = d * ipriv + e;
jmax = f * ipriv + g;
}
while (1);
/* Lastprivate handling. */
/* Only the thread that executed the last logical iteration stores the
   final values back to the shared variables.  */
if (start == niters)
{
/* The thread that has been assigned the last iteration will handle this.
*/
/* The variables other than iterators are very easy. */
x = xpriv;
/* The iterators can be harder, at least in cases where the innermost
loop is not (or might not) be executed at all for some of the outer
loop iterator values. */
/* Try to do something smarter for the cases where the first phase
proved that is not the case? */
/* As fallback, continue iterating with empty bodies the outer loops
until all the conditions fail. */
jpriv += h;
do
{
ipriv += c;
if (!(ipriv < b))
break;
jpriv = d * ipriv + e;
}
while (1);
/* And assign those to the shared variables. */
i = ipriv;
j = jpriv;
}
end:;
DPRINTF ("niters %d\n", niters);
}
/* Volatile so the compiler cannot fold the test arguments below into
   compile-time constants and precompute the loops.  */
volatile int v;
/* Driver: run the serial reference foo and the worksharing bar on several
   fixed loop nests, checking the lastprivate values of i, j, x, then
   stress-test with random loop bounds.  */
int
main ()
{
x = i = j = -1;
#pragma omp parallel num_threads(15)
foo (v + 4, v + 10, v + 1, v + 2, v - 9, v + 1, v, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 10 || j != 9 || x != 8 * 1024 + 7)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
bar (v + 4, v + 10, v + 1, v + 2, v - 9, v + 1, v, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 10 || j != 9 || x != 8 * 1024 + 7)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
foo (v + 1, v + 10, v + 2, v + 0, v + 1, v + 1, v + 1, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 11 || j != 10 || x != 9 * 1024 + 9)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
bar (v + 1, v + 10, v + 2, v + 0, v + 1, v + 1, v + 1, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 11 || j != 10 || x != 9 * 1024 + 9)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
foo (v + 4, v + 8, v + 12, v - 8, v - 9, v - 3, v + 6, v + 15);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 16 || j != 4 || x != 5 * 1024 - 11)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
bar (v + 4, v + 8, v + 12, v - 8, v - 9, v - 3, v + 6, v + 15);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 16 || j != 4 || x != 5 * 1024 - 11)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
foo (v - 13, v + 7, v + 12, v + 3, v + 5, v, v - 6, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 11 || j != 2 || x != -12 * 1024 - 7)
abort ();
DPRINTF ("===\n");
x = i = j = -1;
#pragma omp parallel num_threads(15)
bar (v - 13, v + 7, v + 12, v + 3, v + 5, v, v - 6, v + 1);
DPRINTF ("last %d %d %d\n", i, j, x);
if (i != 11 || j != 2 || x != -12 * 1024 - 7)
abort ();
DPRINTF ("===\n");
/* Randomized stress test (2^30 rounds): draw small loop bounds, re-draw h
   until the OpenMP divisibility requirement ((f - d) * c) % h == 0 holds,
   and under DEBUG compare bar's iteration count against foo's.  */
for (int idx = 0; idx < 16384 * 1024 * 64; idx++)
{
int a = (random () & 63) - 32;
int b = (random () & 63) - 32;
int c = (random () & 31) + 1;
int d = (random () & 63) - 32;
int e = (random () & 63) - 32;
int f = (random () & 63) - 32;
int g = (random () & 63) - 32;
int h = (random () & 31) + 1;
while (((f - d) * c % h) != 0)
h = (random () & 31) + 1;
x = i = j = -1;
#pragma omp parallel num_threads(15)
foo (a, b, c, d, e, f, g, h);
/* NOTE(review): xs/is/js capture foo's lastprivate results but are not
   compared against bar's here — presumably done via the DEBUG asserts
   inside bar instead; confirm this is intentional.  */
int xs = x;
int is = i;
int js = j;
x = i = j = -1;
niterscnt = 0;
#pragma omp parallel num_threads(15)
bar (a, b, c, d, e, f, g, h);
#ifdef DEBUG
if (nitersv != niterscnt)
abort ();
#endif
}
return 0;
}