https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87835

--- Comment #4 from Tom de Vries <vries at gcc dot gnu.org> ---
This minimized test-case (rewritten to avoid the kernels construct, by setting
the num_gangs as libgomp would have chosen it for kernels, and making the loop
a  gang loop):
...
/* { dg-do run } */
/* { dg-additional-options "-lcuda" { target openacc_nvidia_accel_selected } }
*/

#include <openacc.h>
#include <stdlib.h>
#include "cuda.h"

#include <stdio.h>

#define n 128

int
main (void)
{
  CUresult r;
  CUstream stream1;
  int N = n;
  int a[n];
  int b[n];
  int c[n];

  acc_init (acc_device_nvidia);

  r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (1, stream1);

  for (int i = 0; i < n; i++)
    {
      a[i] = 3;
      c[i] = 0;
    }

#pragma acc data copy (a, b, c) copyin (N)
  {
#pragma acc parallel async (1)
    ;

#pragma acc parallel async (1) num_gangs (320)
#pragma loop gang
    for (int ii = 0; ii < N; ii++)
      c[ii] = (a[ii] + a[N - ii - 1]);

#pragma acc parallel async (1)
#pragma acc loop seq
    for (int ii = 0; ii < n; ii++)
      a[ii] = 6;

#pragma acc wait (1)
  }

  unsigned sum = 0;
  for (int i = 0; i < n; i++)
    if (c[i] != 6)
      {
        printf ("%d@%d ", c[i], i);
        sum++;
      }

  if (sum > 0)
    {
      printf ("mismatches: %u\n", sum);
      abort ();
    }

  return 0;
}
...

reproduces 100 out of 100 for me at -O2:
...
nr=100; sum=0; for n in $(seq 1 $nr); do ./run.sh ./asyncwait-1.exe ; if [ $?
-eq 0 ]; then sum=$(($sum + 1)); fi; done; echo ; echo "$sum/$nr"
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
15@125 27@126 39@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
12@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 30@127 mismatches: 3
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
21@126 30@127 mismatches: 2
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 24@127 mismatches: 2
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
15@125 24@126 39@127 mismatches: 3
Aborted (core dumped)
18@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 21@126 30@127 mismatches: 3
Aborted (core dumped)
18@125 27@126 39@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 21@126 36@127 mismatches: 3
Aborted (core dumped)
15@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
18@126 27@127 mismatches: 2
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
18@126 27@127 mismatches: 2
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
15@126 30@127 mismatches: 2
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
9@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
15@126 30@127 mismatches: 2
Aborted (core dumped)
15@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@126 21@127 mismatches: 2
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 30@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@124 15@125 27@126 42@127 mismatches: 4
Aborted (core dumped)
12@126 27@127 mismatches: 2
Aborted (core dumped)
12@125 21@126 36@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
18@126 33@127 mismatches: 2
Aborted (core dumped)
18@126 27@127 mismatches: 2
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 39@127 mismatches: 3
Aborted (core dumped)
15@126 30@127 mismatches: 2
Aborted (core dumped)
12@125 24@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
9@125 18@126 30@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 30@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 18@126 33@127 mismatches: 3
Aborted (core dumped)
15@125 27@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
12@125 24@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 21@126 33@127 mismatches: 3
Aborted (core dumped)
15@126 27@127 mismatches: 2
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
18@126 33@127 mismatches: 2
Aborted (core dumped)
18@126 30@127 mismatches: 2
Aborted (core dumped)
15@126 30@127 mismatches: 2
Aborted (core dumped)
12@125 21@126 36@127 mismatches: 3
Aborted (core dumped)
9@125 24@126 33@127 mismatches: 3
Aborted (core dumped)

0/100
...

This doesn't just look like an order of execution problem.

If we have the "a[ii] = 6" loop executed before the "c[ii] = (a[ii] + a[N - ii
- 1])" loop, still the maximum to be expected for each c element is 12. But we
see higher values here.

Reverting "[nvptx] Remove use of CUDA unified memory in libgomp" makes it pass
100 out of 100.

By forcing map_push to create a new map, rather than to reuse:
...
@@ -304,7 +304,7 @@ map_push (struct ptx_stream *s, size_t size)
   /* Each PTX stream requires a separate data region to store the              
      launch arguments for cuLaunchKernel.  Allocate a new                      
      cuda_map and push it to the end of the list.  */                          
-  if (s->map->active)                                                          
+  if (true)                                                                    
     {                                                                          
       map = cuda_map_create (size);                                            
...
the failure is turned into a duplicate of the more trivial PR88946 - "[nvptx,
openacc, libgomp] cuMemAlloc error for two empty asynchronous parallels".

So, the most likely cause for the error is that map_push returns a map that is
marked inactive, but in fact still in use. This could happen if a launch
completes and the event processing calls map_pop for the wrong map.

Using this debug patch:
...
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index dd2bcf3083f..158ba67d273 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -284,11 +284,13 @@ map_pop (struct ptx_stream *s)

   if (s->map->next == NULL)
     {
+      fprintf (stderr, "map_push makes map inactive: %p\n", s->map);
       s->map->active = false;
       return;
     }

   next = s->map->next;
+  fprintf (stderr, "map_push destroys map: %p\n", s->map);
   cuda_map_destroy (s->map);
   s->map = next;
 }
@@ -323,6 +325,7 @@ map_push (struct ptx_stream *s, size_t size)

   s->map = map;
   s->map->active = true;
+  fprintf (stderr, "map_push returns map: %p\n", s->map);

   return s->map->d;
 }
...

we see:
...
map_push returns map: 0x1e94260
map_push returns map: 0x1e913c0
map_push makes map inactive: 0x1e913c0
map_push returns map: 0x1e913c0
map_push makes map inactive: 0x1e913c0
map_push makes map inactive: 0x1e913c0
15@126 30@127 mismatches: 2
Aborted (core dumped)
...

That indeed seems wrong:

> map_push returns map: 0x1e94260
map for first parallel is allocated

> map_push returns map: 0x1e913c0
map for second parallel is allocated

> map_push makes map inactive: 0x1e913c0

First kernel finishes. This should not touch the map for the second parallel.


The root cause is that map_push is broken: after the second map_push, the map
list consist just of the second item pushed, the first item is dropped.

Reply via email to