Hi Thomas,

On 21.03.23 16:53, Thomas Schwinge wrote:
> On 2022-08-26T11:07:28+0200, Tobias Burnus <tob...@codesourcery.com> wrote:
>> This patch adds initial [OpenMP reverse offload] support for nvptx.
>> CUDA does lockup when trying to copy data from the currently running
>> stream; hence, a new stream is generated to do the memory copying.
> As part of other work, where I had to touch those special code paths, I
> found that we may reduce complexity a little bit "by using the existing
> 'goacc_asyncqueue' instead of re-coding parts of it".  OK to push
> "libgomp: Simplify OpenMP reverse offload host <-> device memory copy
> implementation"
> (still testing), see attached?

I don't think that just calling "exit (EXIT_FAILURE);" is the proper
way – I think that should be GOMP_PLUGIN_fatal in the plugin and
gomp_fatal in target.c.

Otherwise, it LGTM.

Tobias

Subject: [PATCH] libgomp: Simplify OpenMP reverse offload host <-> device
  memory copy implementation

... by using the existing 'goacc_asyncqueue' instead of re-coding parts of it.

Follow-up to commit 131d18e928a3ea1ab2d3bf61aa92d68a8a254609
"libgomp/nvptx: Prepare for reverse-offload callback handling",
and commit ea4b23d9c82d9be3b982c3519fe5e8e9d833a6a8
"libgomp: Handle OpenMP's reverse offloads".

      libgomp/
      * target.c (gomp_target_rev): Instead of 'dev_to_host_cpy',
      'host_to_dev_cpy', 'token', take a single 'goacc_asyncqueue'.
      * libgomp.h (gomp_target_rev): Adjust.
      * libgomp-plugin.c (GOMP_PLUGIN_target_rev): Adjust.
      * libgomp-plugin.h (GOMP_PLUGIN_target_rev): Adjust.
      * plugin/plugin-gcn.c (process_reverse_offload): Adjust.
      * plugin/plugin-nvptx.c (rev_off_dev_to_host_cpy)
      (rev_off_host_to_dev_cpy): Remove.
      (GOMP_OFFLOAD_run): Adjust.
---
  libgomp/libgomp-plugin.c      |   7 +--
  libgomp/libgomp-plugin.h      |   6 +-
  libgomp/libgomp.h             |   5 +-
  libgomp/plugin/plugin-gcn.c   |   2 +-
  libgomp/plugin/plugin-nvptx.c |  77 ++++++++++++++-----------
  libgomp/target.c              | 102 +++++++++++++++-------------------
  6 files changed, 96 insertions(+), 103 deletions(-)

diff --git a/libgomp/libgomp-plugin.c b/libgomp/libgomp-plugin.c
index 27e7c94ba9b..d696515eeb6 100644
--- a/libgomp/libgomp-plugin.c
+++ b/libgomp/libgomp-plugin.c
@@ -82,11 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
  void
  GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t 
devaddrs_ptr,
                      uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-                     void (*dev_to_host_cpy) (void *, const void *, size_t,
-                                              void *),
-                     void (*host_to_dev_cpy) (void *, const void *, size_t,
-                                              void *), void *token)
+                     struct goacc_asyncqueue *aq)
  {
    gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, 
dev_num,
-                dev_to_host_cpy, host_to_dev_cpy, token);
+                aq);
  }
diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h
index 28267f75f7a..42ee3d6c7f9 100644
--- a/libgomp/libgomp-plugin.h
+++ b/libgomp/libgomp-plugin.h
@@ -121,11 +121,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
      __attribute__ ((noreturn, format (printf, 1, 2)));

  extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
-                                 uint64_t, int,
-                                 void (*) (void *, const void *, size_t,
-                                           void *),
-                                 void (*) (void *, const void *, size_t,
-                                           void *), void *);
+                                 uint64_t, int, struct goacc_asyncqueue *);

  /* Prototypes for functions implemented by libgomp plugins.  */
  extern const char *GOMP_OFFLOAD_get_name (void);
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index ba8fe348aba..4d2bfab4b71 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -1130,10 +1130,7 @@ extern void gomp_init_targets_once (void);
  extern int gomp_get_num_devices (void);
  extern bool gomp_target_task_fn (void *);
  extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
-                          int,
-                          void (*) (void *, const void *, size_t, void *),
-                          void (*) (void *, const void *, size_t, void *),
-                          void *);
+                          int, struct goacc_asyncqueue *);

  /* Splay tree definitions.  */
  typedef struct splay_tree_node_s *splay_tree_node;
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 347803762eb..2181bf0235f 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -1949,7 +1949,7 @@ process_reverse_offload (uint64_t fn, uint64_t mapnum, 
uint64_t hostaddrs,
  {
    int dev_num = dev_num64;
    GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
-                       NULL, NULL, NULL);
+                       NULL);
  }

  /* Output any data written to console output from the kernel.  It is expected
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 5bd5a419e0e..4a710851ee5 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -56,6 +56,7 @@
  #include <unistd.h>
  #include <assert.h>
  #include <errno.h>
+#include <stdlib.h>

  /* An arbitrary fixed limit (128MB) for the size of the OpenMP soft stacks
     block to cache between kernel invocations.  For soft-stacks blocks bigger
@@ -1739,11 +1740,11 @@ GOMP_OFFLOAD_openacc_cuda_set_stream (struct 
goacc_asyncqueue *aq, void *stream)
    return 1;
  }

-struct goacc_asyncqueue *
-GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+static struct goacc_asyncqueue *
+nvptx_goacc_asyncqueue_construct (unsigned int flags)
  {
    CUstream stream = NULL;
-  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, CU_STREAM_DEFAULT);
+  CUDA_CALL_ERET (NULL, cuStreamCreate, &stream, flags);

    struct goacc_asyncqueue *aq
      = GOMP_PLUGIN_malloc (sizeof (struct goacc_asyncqueue));
@@ -1751,14 +1752,26 @@ GOMP_OFFLOAD_openacc_async_construct (int device 
__attribute__((unused)))
    return aq;
  }

-bool
-GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
+{
+  return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
+}
+
+static bool
+nvptx_goacc_asyncqueue_destruct (struct goacc_asyncqueue *aq)
  {
    CUDA_CALL_ERET (false, cuStreamDestroy, aq->cuda_stream);
    free (aq);
    return true;
  }

+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_destruct (aq);
+}
+
  int
  GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
  {
@@ -1772,13 +1785,19 @@ GOMP_OFFLOAD_openacc_async_test (struct 
goacc_asyncqueue *aq)
    return -1;
  }

-bool
-GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+static bool
+nvptx_goacc_asyncqueue_synchronize (struct goacc_asyncqueue *aq)
  {
    CUDA_CALL_ERET (false, cuStreamSynchronize, aq->cuda_stream);
    return true;
  }

+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  return nvptx_goacc_asyncqueue_synchronize (aq);
+}
+
  bool
  GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
                                    struct goacc_asyncqueue *aq2)
@@ -2038,22 +2057,6 @@ nvptx_stacks_acquire (struct ptx_device *ptx_dev, size_t 
size, int num)
  }


-void
-rev_off_dev_to_host_cpy (void *dest, const void *src, size_t size,
-                      CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyDtoHAsync, dest, (CUdeviceptr) src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
-void
-rev_off_host_to_dev_cpy (void *dest, const void *src, size_t size,
-                      CUstream stream)
-{
-  CUDA_CALL_ASSERT (cuMemcpyHtoDAsync, (CUdeviceptr) dest, src, size, stream);
-  CUDA_CALL_ASSERT (cuStreamSynchronize, stream);
-}
-
  void
  GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
  {
@@ -2087,9 +2090,17 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, 
void **args)
      }
    nvptx_adjust_launch_bounds (tgt_fn, ptx_dev, &teams, &threads);

-  size_t stack_size = nvptx_stacks_size ();
    bool reverse_offload = ptx_dev->rev_data != NULL;
-  CUstream copy_stream = NULL;
+  struct goacc_asyncqueue *reverse_offload_aq = NULL;
+  if (reverse_offload)
+    {
+      reverse_offload_aq
+     = nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
+      if (!reverse_offload_aq)
+     exit (EXIT_FAILURE);
+    }
+
+  size_t stack_size = nvptx_stacks_size ();

    pthread_mutex_lock (&ptx_dev->omp_stacks.lock);
    void *stacks = nvptx_stacks_acquire (ptx_dev, stack_size, teams * threads);
@@ -2103,8 +2114,6 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, 
void **args)
    GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                   " [(teams: %u), 1, 1] [(lanes: 32), (threads: %u), 1]\n",
                   __FUNCTION__, fn_name, teams, threads);
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamCreate, &copy_stream, CU_STREAM_NON_BLOCKING);
    r = CUDA_CALL_NOCHECK (cuLaunchKernel, function, teams, 1, 1,
                       32, threads, 1, 0, NULL, NULL, config);
    if (r != CUDA_SUCCESS)
@@ -2127,17 +2136,15 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void 
*tgt_vars, void **args)
          GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
                                  rev_data->addrs, rev_data->sizes,
                                  rev_data->kinds, rev_data->dev_num,
-                                 rev_off_dev_to_host_cpy,
-                                 rev_off_host_to_dev_cpy, copy_stream);
-         CUDA_CALL_ASSERT (cuStreamSynchronize, copy_stream);
+                                 reverse_offload_aq);
+         if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
+           exit (EXIT_FAILURE);
          __atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
        }
      usleep (1);
        }
    else
      r = CUDA_CALL_NOCHECK (cuCtxSynchronize, );
-  if (reverse_offload)
-    CUDA_CALL_ASSERT (cuStreamDestroy, copy_stream);
    if (r == CUDA_ERROR_LAUNCH_FAILED)
      GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                     maybe_abort_msg);
@@ -2145,6 +2152,12 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, 
void **args)
      GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));

    pthread_mutex_unlock (&ptx_dev->omp_stacks.lock);
+
+  if (reverse_offload)
+    {
+      if (!nvptx_goacc_asyncqueue_destruct (reverse_offload_aq))
+     exit (EXIT_FAILURE);
+    }
  }

  /* TODO: Implement GOMP_OFFLOAD_async_run. */
diff --git a/libgomp/target.c b/libgomp/target.c
index 79ed64a5dc3..e02188cf7e1 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -3312,9 +3312,7 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t 
*devaddrs,
  void
  gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
               uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
-              void (*dev_to_host_cpy) (void *, const void *, size_t, void*),
-              void (*host_to_dev_cpy) (void *, const void *, size_t, void*),
-              void *token)
+              struct goacc_asyncqueue *aq)
  {
    /* Return early if there is no offload code.  */
    if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3356,26 +3354,17 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, 
uint64_t devaddrs_ptr,
        devaddrs = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
        sizes = (uint64_t *) gomp_malloc (mapnum * sizeof (uint64_t));
        kinds = (unsigned short *) gomp_malloc (mapnum * sizeof (unsigned 
short));
-      if (dev_to_host_cpy)
-     {
-       dev_to_host_cpy (devaddrs, (const void *) (uintptr_t) devaddrs_ptr,
-                        mapnum * sizeof (uint64_t), token);
-       dev_to_host_cpy (sizes, (const void *) (uintptr_t) sizes_ptr,
-                        mapnum * sizeof (uint64_t), token);
-       dev_to_host_cpy (kinds, (const void *) (uintptr_t) kinds_ptr,
-                        mapnum * sizeof (unsigned short), token);
-     }
-      else
-     {
-       gomp_copy_dev2host (devicep, NULL, devaddrs,
-                           (const void *) (uintptr_t) devaddrs_ptr,
-                           mapnum * sizeof (uint64_t));
-       gomp_copy_dev2host (devicep, NULL, sizes,
-                           (const void *) (uintptr_t) sizes_ptr,
-                           mapnum * sizeof (uint64_t));
-       gomp_copy_dev2host (devicep, NULL, kinds, (const void *) (uintptr_t) 
kinds_ptr,
-                           mapnum * sizeof (unsigned short));
-     }
+      gomp_copy_dev2host (devicep, aq, devaddrs,
+                       (const void *) (uintptr_t) devaddrs_ptr,
+                       mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, sizes,
+                       (const void *) (uintptr_t) sizes_ptr,
+                       mapnum * sizeof (uint64_t));
+      gomp_copy_dev2host (devicep, aq, kinds,
+                       (const void *) (uintptr_t) kinds_ptr,
+                       mapnum * sizeof (unsigned short));
+      if (aq && !devicep->openacc.async.synchronize_func (aq))
+     exit (EXIT_FAILURE);
      }

    size_t tgt_align = 0, tgt_size = 0;
@@ -3402,13 +3391,14 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, 
uint64_t devaddrs_ptr,
          if (devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
            memcpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
                    (size_t) sizes[i]);
-         else if (dev_to_host_cpy)
-           dev_to_host_cpy (tgt + tgt_size, (void *) (uintptr_t) devaddrs[i],
-                            (size_t) sizes[i], token);
          else
-           gomp_copy_dev2host (devicep, NULL, tgt + tgt_size,
-                               (void *) (uintptr_t) devaddrs[i],
-                               (size_t) sizes[i]);
+           {
+             gomp_copy_dev2host (devicep, aq, tgt + tgt_size,
+                                 (void *) (uintptr_t) devaddrs[i],
+                                 (size_t) sizes[i]);
+             if (aq && !devicep->openacc.async.synchronize_func (aq))
+               exit (EXIT_FAILURE);
+           }
          devaddrs[i] = (uint64_t) (uintptr_t) tgt + tgt_size;
          tgt_size = tgt_size + sizes[i];
          if ((devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
@@ -3498,15 +3488,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, 
uint64_t devaddrs_ptr,
                  || kind == GOMP_MAP_ALWAYS_TO
                  || kind == GOMP_MAP_ALWAYS_TOFROM)
                {
-                 if (dev_to_host_cpy)
-                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-                                    (void *) (uintptr_t) cdata[i].devaddr,
-                                    sizes[i], token);
-                 else
-                   gomp_copy_dev2host (devicep, NULL,
-                                       (void *) (uintptr_t) devaddrs[i],
-                                       (void *) (uintptr_t) cdata[i].devaddr,
-                                       sizes[i]);
+                 gomp_copy_dev2host (devicep, aq,
+                                     (void *) (uintptr_t) devaddrs[i],
+                                     (void *) (uintptr_t) cdata[i].devaddr,
+                                     sizes[i]);
+                 if (aq && !devicep->openacc.async.synchronize_func (aq))
+                   {
+                     gomp_mutex_unlock (&devicep->lock);
+                     exit (EXIT_FAILURE);
+                   }
                }
              if (struct_cpy)
                struct_cpy--;
@@ -3573,15 +3563,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, 
uint64_t devaddrs_ptr,
                  devaddrs[i]
                    = (uint64_t) (uintptr_t) gomp_aligned_alloc (align,
                                                                 sizes[i]);
-                 if (dev_to_host_cpy)
-                   dev_to_host_cpy ((void *) (uintptr_t) devaddrs[i],
-                                    (void *) (uintptr_t) cdata[i].devaddr,
-                                    sizes[i], token);
-                 else
-                   gomp_copy_dev2host (devicep, NULL,
-                                       (void *) (uintptr_t) devaddrs[i],
-                                       (void *) (uintptr_t) cdata[i].devaddr,
-                                       sizes[i]);
+                 gomp_copy_dev2host (devicep, aq,
+                                     (void *) (uintptr_t) devaddrs[i],
+                                     (void *) (uintptr_t) cdata[i].devaddr,
+                                     sizes[i]);
+                 if (aq && !devicep->openacc.async.synchronize_func (aq))
+                   {
+                     gomp_mutex_unlock (&devicep->lock);
+                     exit (EXIT_FAILURE);
+                   }
                }
              for (j = i + 1; j < mapnum; j++)
                {
@@ -3685,15 +3675,15 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, 
uint64_t devaddrs_ptr,
              /* FALLTHRU */
            case GOMP_MAP_FROM:
            case GOMP_MAP_TOFROM:
-             if (copy && host_to_dev_cpy)
-               host_to_dev_cpy ((void *) (uintptr_t) cdata[i].devaddr,
-                                (void *) (uintptr_t) devaddrs[i],
-                                sizes[i], token);
-             else if (copy)
-               gomp_copy_host2dev (devicep, NULL,
-                                   (void *) (uintptr_t) cdata[i].devaddr,
-                                   (void *) (uintptr_t) devaddrs[i],
-                                   sizes[i], false, NULL);
+             if (copy)
+               {
+                 gomp_copy_host2dev (devicep, aq,
+                                     (void *) (uintptr_t) cdata[i].devaddr,
+                                     (void *) (uintptr_t) devaddrs[i],
+                                     sizes[i], false, NULL);
+                 if (aq && !devicep->openacc.async.synchronize_func (aq))
+                   exit (EXIT_FAILURE);
+               }
            default:
              break;
          }
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955

Reply via email to