Hi! On 2022-03-01T17:46:20+0100, I wrote: > On 2022-01-13T10:54:16+0100, I wrote: >> On 2019-05-08T14:51:57+0100, Julian Brown <jul...@codesourcery.com> wrote: >>> - The "addressable" bit is set during the kernels conversion pass for >>> variables that have "create" (alloc) clauses created for them in the >>> synthesised outer data region (instead of in the front-end, etc., >>> where it can't be done accurately). Such variables actually have >>> their address taken during transformations made in a later pass >>> (omp-low, I think), but there's a phase-ordering problem that means >>> the flag should be set earlier. >> >> The actual issue is a bit different, but yes, there is a problem. >> The related ICE has also been reported as <https://gcc.gnu.org/PR100280> >> "ICE in lower_omp_target, at omp-low.c:12287". [...]
We've resolved all such known ICEs -- but still have open <https://gcc.gnu.org/PR104892> "OpenACC 'kernels' decomposition: wrong-code cases unless manually making certain variables addressable". This is avoided by: > workaround patches like > we have on the og11 development branch: > - "Avoid introducing 'create' mapping clauses for loop index variables in > kernels regions", > - "Run all kernels regions with GOMP_MAP_FORCE_TOFROM mappings > synchronously", > - "Fix for is_gimple_reg vars to 'data kernels'" ..., but the misbehavior is visible without the workaround patches, for example on the master branch. Pushed to master branch commit 535afbd959bc72de85fca36ba6417f075cca1018 "OpenACC 'kernels' decomposition: wrong-code cases unless manually making certain variables addressable [PR104892]", see attached, to "Document a few examples of the status quo". Grüße Thomas ----------------- Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
>From 535afbd959bc72de85fca36ba6417f075cca1018 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge <tho...@codesourcery.com> Date: Fri, 11 Mar 2022 15:11:25 +0100 Subject: [PATCH] OpenACC 'kernels' decomposition: wrong-code cases unless manually making certain variables addressable [PR104892] Document a few examples of the status quo. PR middle-end/104892 libgomp/ * testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c: Point to PR104892. * testsuite/libgomp.oacc-c-c++-common/default-1.c: Likewise, enable '--param=openacc-kernels=decompose' and adjust. * testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise. * testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90: Likewise. --- .../libgomp.oacc-c-c++-common/default-1.c | 14 ++++++-- .../kernels-decompose-1.c | 4 +-- .../kernels-reduction-1.c | 8 ++++- .../libgomp.oacc-c-c++-common/parallel-dims.c | 34 +++++++++++++------ .../kernels-reduction-1.f90 | 15 +++++++- 5 files changed, 59 insertions(+), 16 deletions(-) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/default-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/default-1.c index 0ac8d7132d4..fed65c8dccc 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/default-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/default-1.c @@ -1,3 +1,5 @@ +/* { dg-additional-options "--param=openacc-kernels=decompose" } */ + /* { dg-additional-options "-fopt-info-all-omp" } { dg-additional-options "-foffload=-fopt-info-all-omp" } */ @@ -63,6 +65,8 @@ int test_parallel () int test_kernels () { int val = 2; + /*TODO <https://gcc.gnu.org/PR104892> */ + (volatile int *) &val; int ary[32]; int ondev = 0; @@ -71,12 +75,18 @@ int test_kernels () /* val defaults to copy, ary defaults to copy. */ #pragma acc kernels copy(ondev) /* { dg-line l_compute[incr c_compute] } */ - /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ - /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'ondev\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'val\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ { + /* { dg-note {beginning 'gang-single' part in OpenACC 'kernels' region} {} { target *-*-* } .+1 } */ ondev = acc_on_device (acc_device_not_host); + /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target { c++ && { ! __OPTIMIZE__ } } } .-1 } + ..., as without optimizations, we're not inlining the C++ 'acc_on_device' wrapper. */ #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ + /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */ for (unsigned i = 0; i < 32; i++) { ary[i] = val; diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c index eb424776b6b..3db59e8a75c 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-decompose-1.c @@ -29,12 +29,12 @@ static int g2; static void f1 () { int a = 0; - /*TODO Without making 'a' addressable, for GCN offloading we will not see the expected value copied out. (But it does work for nvptx offloading, strange...) */ + /*TODO <https://gcc.gnu.org/PR104892> */ (volatile int *) &a; #define N 123 int b[N] = { 0 }; unsigned long long f1; - /*TODO See above. */ + /*TODO <https://gcc.gnu.org/PR104892> */ (volatile void *) &f1; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c index fbd9815f683..e7b2817a391 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-reduction-1.c @@ -1,6 +1,8 @@ /* Verify that a simple, explicit acc loop reduction works inside a kernels region. */ +/* { dg-additional-options "--param=openacc-kernels=decompose" } */ + /* { dg-additional-options "-fopt-info-all-omp" } { dg-additional-options "-foffload=-fopt-info-all-omp" } */ @@ -17,12 +19,16 @@ int main () { int i, red = 0; + /*TODO <https://gcc.gnu.org/PR104892> */ + (volatile int *) &red; #pragma acc kernels /* { dg-line l_compute1 } */ - /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_compute1 } */ + /* { dg-note {variable 'red\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute1 } */ { #pragma acc loop reduction (+:red) /* { dg-line l_loop_i1 } */ + /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i1 } */ /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i1 } */ + /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i1 } */ for (i = 0; i < N; i++) red++; } diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c index f9c7aed3a56..75e8cb510cc 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c @@ -1,6 +1,8 @@ /* OpenACC parallelism dimensions clauses: num_gangs, num_workers, vector_length. */ +/* { dg-additional-options "--param=openacc-kernels=decompose" } */ + /* { dg-additional-options "-fopt-info-all-omp" } { dg-additional-options "-foffload=-fopt-info-all-omp" } */ @@ -640,20 +642,26 @@ int main () kernels. */ { int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; + /*TODO <https://gcc.gnu.org/PR104892> */ + (volatile int *) &gangs_min, &gangs_max, &workers_min, &workers_max, &vectors_min, &vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ - /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ - /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'gangs_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'workers_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'vectors_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ { - /* This is to make the OpenACC kernels construct unparallelizable. */ - asm volatile ("" : : : "memory"); - #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) + /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */ for (int i = 100; i > -100; --i) { + /* This is to make the loop unparallelizable. */ + asm volatile ("" : : : "memory"); + gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); @@ -674,23 +682,29 @@ int main () #define WORKERS 5 #define VECTORS 13 int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; + /*TODO <https://gcc.gnu.org/PR104892> */ + (volatile int *) &gangs_min, &gangs_max, &workers_min, &workers_max, &vectors_min, &vectors_max; gangs_min = workers_min = vectors_min = INT_MAX; gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc kernels /* { dg-line l_compute[incr c_compute] } */ \ num_gangs (gangs) \ num_workers (WORKERS) \ vector_length (VECTORS) - /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ - /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'gangs_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'workers_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ + /* { dg-note {variable 'vectors_max\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute$c_compute } */ { - /* This is to make the OpenACC kernels construct unparallelizable. */ - asm volatile ("" : : : "memory"); - #pragma acc loop /* { dg-line l_loop_i[incr c_loop_i] } */ \ reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) + /* { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-note {variable 'i' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ /* { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i$c_loop_i } */ + /* { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i$c_loop_i } */ for (int i = 100; i > -100; --i) { + /* This is to make the loop unparallelizable. */ + asm volatile ("" : : : "memory"); + gangs_min = gangs_max = acc_gang (); workers_min = workers_max = acc_worker (); vectors_min = vectors_max = acc_vector (); diff --git a/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 b/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 index 6ff740efc32..89bae49c94c 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 +++ b/libgomp/testsuite/libgomp.oacc-fortran/kernels-reduction-1.f90 @@ -2,6 +2,8 @@ ! { dg-do run } +! { dg-additional-options "--param=openacc-kernels=decompose" } + ! { dg-additional-options "-fopt-info-all-omp" } ! { dg-additional-options "-foffload=-fopt-info-all-omp" } */ @@ -13,17 +15,28 @@ program reduction integer, parameter :: n = 20 integer :: i, red + !TODO <https://gcc.gnu.org/PR104892> + call make_addressable (red) red = 0 !$acc kernels ! { dg-line l_compute1 } */ - ! { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_compute1 } + ! { dg-note {variable 'red\.[0-9]+' declared in block isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_compute1 } !$acc loop reduction (+:red) ! { dg-line l_loop_i1 } + ! { dg-note {forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis} {} { target *-*-* } l_loop_i1 } ! { dg-note {variable 'i' in 'private' clause isn't candidate for adjusting OpenACC privatization level: not addressable} {} { target *-*-* } l_loop_i1 } + ! { dg-optimized {assigned OpenACC seq loop parallelism} {} { target *-*-* } l_loop_i1 } do i = 1, n red = red + 1 end do !$acc end kernels if (red .ne. n) stop 1 + +contains + + subroutine make_addressable (v) + integer :: v ! by reference + end subroutine make_addressable + end program reduction -- 2.34.1