Hi Jakub,

--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -224,6 +224,8 @@ OMP_5.1 {
     omp_set_teams_thread_limit_8_;
     omp_get_teams_thread_limit;
     omp_get_teams_thread_limit_;
+    omp_target_memcpy_async;
+    omp_target_memcpy_rect_async;
  } OMP_5.0.2;

These should be added to OMP_5.1.1, not here.

Changed.

--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -272,6 +272,10 @@ extern int omp_target_is_present (const void *, int) 
__GOMP_NOTHROW;
  extern int omp_target_memcpy (void *, const void *, __SIZE_TYPE__,
                           __SIZE_TYPE__, __SIZE_TYPE__, int, int)
    __GOMP_NOTHROW;
+extern int omp_target_memcpy_async (void *, const void *, __SIZE_TYPE__,
+                                __SIZE_TYPE__, __SIZE_TYPE__, int, int,
+                                int, omp_depend_t*)

Formatting, space before *.

Changed.

+  __GOMP_NOTHROW;
  extern int omp_target_memcpy_rect (void *, const void *, __SIZE_TYPE__, int,
                                const __SIZE_TYPE__ *,
                                const __SIZE_TYPE__ *,
@@ -279,6 +283,14 @@ extern int omp_target_memcpy_rect (void *, const void *, 
__SIZE_TYPE__, int,
                                const __SIZE_TYPE__ *,
                                const __SIZE_TYPE__ *, int, int)
    __GOMP_NOTHROW;
+extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__,
+                                     int, const __SIZE_TYPE__ *,
+                                     const __SIZE_TYPE__ *,
+                                     const __SIZE_TYPE__ *,
+                                     const __SIZE_TYPE__ *,
+                                     const __SIZE_TYPE__ *, int, int, int,
+                                     omp_depend_t*)

Likewise.

Changed.

-int
-omp_target_memcpy (void *dst, const void *src, size_t length,
-               size_t dst_offset, size_t src_offset, int dst_device_num,
-               int src_device_num)
+static int
+omp_target_memcpy_check (void *dst, const void *src, int dst_device_num,
+                     int src_device_num,
+                     struct gomp_device_descr **dst_devicep,
+                     struct gomp_device_descr **src_devicep)
  {

Why does omp_target_memcpy_check need the dst and src arguments?  From what
I can see, they aren't used by it.

Good point, dst and src arguments are removed.

+typedef struct
+{
+  void *dst;
+  const void *src;
+  size_t length;
+  size_t dst_offset;
+  size_t src_offset;
+  struct gomp_device_descr *dst_devicep;
+  struct gomp_device_descr *src_devicep;
+} memcpy_t;

Please come up with some less generic name, struct omp_target_memcpy_data
or something similar.  Even the *_t suffix is problematic, as *_t is
reserved for the implementation.

Renamed "memcpy_t" into "omp_target_memcpy_data" and "memcpy_rect_t" into
"omp_target_memcpy_rect_data".

+
+void
+omp_target_memcpy_async_helper (void *args)

This should be static.

Changed for "omp_target_memcpy_async_helper" and
"omp_target_memcpy_rect_async_helper".

+{
+  memcpy_t *a = args;
+  int ret = omp_target_memcpy_copy (a->dst, a->src, a->length, a->dst_offset,
+                                a->src_offset, a->dst_devicep,
+                                a->src_devicep);
+  if (ret)
+    gomp_fatal ("asynchronous memcpy failed");

I'm not really sure killing the whole program if the copying failed is the
best action.  Has it been discussed on omp-lang?  Perhaps the APIs should
have a way how to propagate the result to the caller when it completes
somehow?

I agree that gomp_fatal is quite harsh here. Otherwise I am afraid that
undefined behaviour can result from silently ignoring copy failures. I agree
with Tobias to keep gomp_fatal for now (as I don't see any useful alternative
yet) and discuss a (general) approach for OpenMP (as Tobias triggered in
https://github.com/OpenMP/spec/issues/3286).

As Tobias suggested, I replaced the error messages with "omp_target_memcpy
failed" and "omp_target_memcpy_rect failed".

Even if we do that, the ret variable seems to be superfluos, just do
   if (omp_target_memcpy_copy (...))
     gomp_fatal (...);

Changed.


+{
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+
+  int check = omp_target_memcpy_check (dst, src, dst_device_num, 
src_device_num,
+                                   &dst_devicep, &src_devicep);
+  if (check)
+    return check;
+
+  void (*fn) (void *) = &omp_target_memcpy_async_helper;
+  void *data = NULL;
+  void (*cpyfn) (void *, void *) = NULL;
+  long arg_size = 0;
+  long arg_align = 0;
+  bool if_clause = false;
+  unsigned flags = 0;
+  int priority_arg = 0;
+  void *detach = NULL;
+
+  memcpy_t s = {
+    .dst = dst,
+    .src = src,
+    .length = length,
+    .dst_offset = dst_offset,
+    .src_offset = src_offset,
+    .dst_devicep = dst_devicep,
+    .src_devicep = src_devicep
+  };

I think we in libgomp try to use C89 and so declare vars first before other
statements.

Changed.

+  data = &s;
+
+  void *depend[depobj_count+5];

Spaces around + , i.e. depobj_count + 5

Corrected two occurrences.

+  depend[0] = 0;
+  depend[1] = (void*) ((uintptr_t) depobj_count);

Space before *.  The ()s around (uintptr_t) depobj_count
are superfluous.

Corrected two occurrences.


+  depend[2] = depend[3] = depend[4] = 0;
+  for (int i = 0; i < depobj_count; ++i)
+    depend[i+5] = &depobj_list[i];

i + 5

+
+  if (depobj_count > 0 && depobj_list != NULL)
+    flags |= GOMP_TASK_FLAG_DEPEND;
+
+  GOMP_task (fn, data, cpyfn, arg_size, arg_align, if_clause, flags, depend,
+         priority_arg, detach);

We need to make sure that GOMP_task doesn't go through PLT.
So, I think this needs to be ialias_call and task.c needs to add ialias for
GOMP_task.

I added "ialias (GOMP_task)" in task.c but this seems to be not sufficient for
ialias_call in target.c ? That's why I used ialias_redirect in target.c, is 
that ok?

Also, I must say I don't like very much using variables that you initialize
to constants and just pass to the call, either pass the constants directly
to the call, or use /*priority_arg=*/0, /*detach=*/NULL style.

Adjusted at multiple locations.


More importantly, I have no idea how this can work when you pass arg_size 0
and arg_align 0.  The s variable is in the current function frame, with
arg_size 0 nothing is really copied to the generated task.
arg_size should be sizeof (memcpy_t) and arg_align __alignof__ (memcpy_t)
(well, struct omp_target_memcpy_data).

The copy function of GOMP_task ("cpyfn") is not used here (set to NULL) and thus
also arg_size and arg_align are set to 0 since they are related to cpyfn if I
understand it correctly.

The actual copy process is done by "fn", i.e. omp_target_memcpy_async_helper,
that gets all necessary data through the "data" argument (2nd parameter of
GOMP_task). In case of cpyfn==NULL fn is directly applied to data. It seems that
cpyfn is only needed to change/backup data before fn is applied to it.


Also, it would be nice to avoid GOMP_task for the depobj_count == 0 case
at least sometimes (but perhaps that can be done incrementally) and instead
use some CUDA etc. asynchronous copy APIs.  We don't really need to wait
for anything in that case, and from OpenMP POV all we need to make sure is
that barrier/taskwait/taskgroup end will know about these "tasks" and
wait for them.  So, it can be implemented more like #pragma omp target nowait
instead of #pragma omp task that calls the synchronous omp_target_memcpy.
Though, maybe that is how it should be implemented always, something like
gomp_create_target_task and its caller.  We already use that single routine
for multiple purposes (target nowait as well as target enter/exit data
nowait), so just telling it somehow that it shouldn't do mapping/unmapping
and perhaps target execution and instead copying would be nice.

I dont't see/understand the advantage using gomp_create_target_task over
GOMP_task. Whether the task waits for dependencies
("gomp_task_maybe_wait_for_dependencies") depends on GOMP_TASK_FLAG_DEPEND which
is only set if depobj_count > 0 and depobj_list != NULL. Thus, there shouldn't
be any waiting in case of depobj_count == 0? Additionally, in both functions a
new thread is created - independently of dependencies.

Did I misunderstand/oversee something?

There indeed seem to exist async variants for cuMemcpyHtoD and cuMemcpyDtoH
(which are currently used in the nvptx plugin): cuMemcpyDtoHAsync and
cuMemcpyHtoDAsync. I suggest to keep this in mind for a separate patch as it
needs a bit more changes and testing on the plugins side.


E.g. if one uses omp_target_memcpy_async outside of any explicit parallel
or host teams, would be nice if it still was asynchronous and not
synchronous.  But even in explicit parallel, would be nice if we didn't
waste one of the threads waiting for it when it can do useful work on the
host.  It is true that for target nowait we have one unshackeled thread
usually that polls the device.  Though that is mainly because we need to
do some unmapping at the end of target nowait, including taking the lock
etc.  For the async copying maybe we don't need to take any lock and could
just arrange for check if already completed or sleep until completed if
possible, at least in the future.  For now at least handling it like target
nowait would be an improvement.

+} memcpy_rect_t;

Again, please use better type name.

Changed.


+
+void

And this should be static.

Changed.


+omp_target_memcpy_rect_async_helper (void *args)
+{
+  memcpy_rect_t *a = args;
+  int ret = omp_target_memcpy_rect_copy (a->dst, a->src, a->element_size,
+                                     a->num_dims, a->volume, a->dst_offsets,
+                                     a->src_offsets, a->dst_dimensions,
+                                     a->src_dimensions, a->dst_devicep,
+                                     a->src_devicep);
+  if (ret)
+    gomp_fatal ("asynchronous memcpy failed");

See above.

Updated error message as Tobias suggested.

An updated patch is attached (and tested again on x86_64-linux with nvptx and
amdgcn offloading without regression).

Marcel
-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 
München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas 
Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht 
München, HRB 106955
OpenMP, libgomp: Add new runtime routines omp_target_memcpy_async and
omp_target_memcpy_rect_async.

This patch adds two new OpenMP runtime routines: omp_target_memcpy_async and
omp_target_memcpy_rect_async. Both functions are introduced in OpenMP 5.1 as
asynchronous variants of omp_target_memcpy and omp_target_memcpy_rect.

In contrast to the synchronous variants, the asynchronous functions have two
additional function parameters to allow the specification of task dependences:

        int depobj_count
        omp_depend_t *depobj_list

        integer(c_int), value :: depobj_count
        integer(omp_depend_kind), optional :: depobj_list(*)

The implementation splits the synchronous functions into two parts: (a) check
and (b) copy. Then (a) is used in the asynchronous functions for the sequential
part, and the actual copy process (b) is executed in a new created task. The
sequential part (a) takes into account the requirements for the return values: 

"The routine returns zero if successful. Otherwise, it returns a non-zero
value." (omp_target_memcpy_async, OpenMP 5.1 spec, section 3.8.7)

"An application can determine the number of inclusive dimensions supported by an
implementation by passing NULL pointers (or C_NULL_PTR, for Fortran) for both
dst and src. The routine returns the number of dimensions supported by the
implementation for the specified device numbers. No copy operation is
performed." (omp_target_memcpy_rect_async, OpenMP 5.1 spec, section 3.8.8)

Due to asynchronicity an error is thrown if the asynchronous memcpy is not
successful (in contrast to the synchronous functions which use a return
value unequal to zero).

gcc/ChangeLog:

        * omp-low.cc (omp_runtime_api_call): Added target_memcpy_async and
        target_memcpy_rect_async to omp_runtime_apis array.

libgomp/ChangeLog:

        * libgomp.map: Added omp_target_memcpy_async and
        omp_target_memcpy_rect_async.
        * libgomp.texi: Both functions are now supported.
        * omp.h.in: Added omp_target_memcpy_async and
        omp_target_memcpy_rect_async.
        * omp_lib.f90.in: Added interfaces for both new functions.
        * omp_lib.h.in: Likewise.
        * target.c (ialias_redirect): Added for GOMP_task.
        (omp_target_memcpy): Restructured into check and copy part.
        (omp_target_memcpy_check): New helper function for omp_target_memcpy and
        omp_target_memcpy_async that checks requirements.
        (omp_target_memcpy_copy): New helper function for omp_target_memcpy and
        omp_target_memcpy_async that performs the memcpy.
        (omp_target_memcpy_async_helper): New helper function that is used in
        omp_target_memcpy_async for the asynchronous task.
        (omp_target_memcpy_async): Added.
        (omp_target_memcpy_rect): Restructured into check and copy part.
        (omp_target_memcpy_rect_check): New helper function for
        omp_target_memcpy_rect and omp_target_memcpy_rect_async that checks
        requirements.
        (omp_target_memcpy_rect_copy): New helper function for
        omp_target_memcpy_rect and omp_target_memcpy_rect_async that performs
        the memcpy.
        (omp_target_memcpy_rect_async_helper): New helper function that is used
        in omp_target_memcpy_rect_async for the asynchronous task.
        (omp_target_memcpy_rect_async): Added.
        * task.c (ialias): Added for GOMP_task.
        * testsuite/libgomp.c-c++-common/target-memcpy-async-1.c: New test.
        * testsuite/libgomp.c-c++-common/target-memcpy-async-2.c: New test.
        * testsuite/libgomp.c-c++-common/target-memcpy-rect-async-1.c: New test.
        * testsuite/libgomp.c-c++-common/target-memcpy-rect-async-2.c: New test.
        * testsuite/libgomp.fortran/target-memcpy-async-1.f90: New test.
        * testsuite/libgomp.fortran/target-memcpy-async-2.f90: New test.
        * testsuite/libgomp.fortran/target-memcpy-rect-async-1.f90: New test.
        * testsuite/libgomp.fortran/target-memcpy-rect-async-2.f90: New test.

diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index 4c52886..3682c4c 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -3990,7 +3990,9 @@ omp_runtime_api_call (const_tree fndecl)
       "target_free",
       "target_is_present",
       "target_memcpy",
+      "target_memcpy_async",
       "target_memcpy_rect",
+      "target_memcpy_rect_async",
       NULL,
       /* Now omp_* calls that are available as omp_* and omp_*_; however, the
         DECL_NAME is always omp_* without tailing underscore.  */
diff --git a/libgomp/libgomp.map b/libgomp/libgomp.map
index 608a54c..fd3c15e 100644
--- a/libgomp/libgomp.map
+++ b/libgomp/libgomp.map
@@ -229,6 +229,8 @@ OMP_5.1 {
 OMP_5.1.1 {
   global:
        omp_get_mapped_ptr;
+       omp_target_memcpy_async;
+       omp_target_memcpy_rect_async;
 } OMP_5.1;
 
 GOMP_1.0 {
diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index 38e0337..9322301 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -313,7 +313,7 @@ The OpenMP 4.5 specification is fully supported.
       routines @tab Y @tab
 @item @code{omp_target_is_accessible} runtime routine @tab N @tab
 @item @code{omp_target_memcpy_async} and @code{omp_target_memcpy_rect_async}
-      runtime routines @tab N @tab
+      runtime routines @tab Y @tab
 @item @code{omp_get_mapped_ptr} runtime routine @tab Y @tab
 @item @code{omp_calloc}, @code{omp_realloc}, @code{omp_aligned_alloc} and
       @code{omp_aligned_calloc} runtime routines @tab Y @tab
diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in
index 18d0152..cf93c97 100644
--- a/libgomp/omp.h.in
+++ b/libgomp/omp.h.in
@@ -272,6 +272,10 @@ extern int omp_target_is_present (const void *, int) 
__GOMP_NOTHROW;
 extern int omp_target_memcpy (void *, const void *, __SIZE_TYPE__,
                              __SIZE_TYPE__, __SIZE_TYPE__, int, int)
   __GOMP_NOTHROW;
+extern int omp_target_memcpy_async (void *, const void *, __SIZE_TYPE__,
+                                   __SIZE_TYPE__, __SIZE_TYPE__, int, int,
+                                   int, omp_depend_t *)
+  __GOMP_NOTHROW;
 extern int omp_target_memcpy_rect (void *, const void *, __SIZE_TYPE__, int,
                                   const __SIZE_TYPE__ *,
                                   const __SIZE_TYPE__ *,
@@ -279,6 +283,14 @@ extern int omp_target_memcpy_rect (void *, const void *, 
__SIZE_TYPE__, int,
                                   const __SIZE_TYPE__ *,
                                   const __SIZE_TYPE__ *, int, int)
   __GOMP_NOTHROW;
+extern int omp_target_memcpy_rect_async (void *, const void *, __SIZE_TYPE__,
+                                        int, const __SIZE_TYPE__ *,
+                                        const __SIZE_TYPE__ *,
+                                        const __SIZE_TYPE__ *,
+                                        const __SIZE_TYPE__ *,
+                                        const __SIZE_TYPE__ *, int, int, int,
+                                        omp_depend_t *)
+  __GOMP_NOTHROW;
 extern int omp_target_associate_ptr (const void *, const void *, __SIZE_TYPE__,
                                     __SIZE_TYPE__, int) __GOMP_NOTHROW;
 extern int omp_target_disassociate_ptr (const void *, int) __GOMP_NOTHROW;
diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in
index 506f15c..38e421c 100644
--- a/libgomp/omp_lib.f90.in
+++ b/libgomp/omp_lib.f90.in
@@ -799,6 +799,22 @@
         end interface
 
         interface
+          function omp_target_memcpy_async (dst, src, length, dst_offset, &
+                                            src_offset, dst_device_num, &
+                                            src_device_num, depobj_count, &
+                                            depobj_list) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            import :: omp_depend_kind
+            integer(c_int) :: omp_target_memcpy_async
+            type(c_ptr), value :: dst, src
+            integer(c_size_t), value :: length, dst_offset, src_offset
+            integer(c_int), value :: dst_device_num, src_device_num, &
+                                     depobj_count
+            integer(omp_depend_kind), optional :: depobj_list(*)
+          end function omp_target_memcpy_async
+        end interface
+
+        interface
           function omp_target_memcpy_rect (dst,src,element_size, num_dims, &
                                            volume, dst_offsets, src_offsets, &
                                            dst_dimensions, src_dimensions, &
@@ -816,6 +832,30 @@
         end interface
 
         interface
+          function omp_target_memcpy_rect_async (dst,src,element_size,     &
+                                                 num_dims, volume,         &
+                                                 dst_offsets, src_offsets, &
+                                                 dst_dimensions,           &
+                                                 src_dimensions,           &
+                                                 dst_device_num,           &
+                                                 src_device_num,           &
+                                                 depobj_count,             &
+                                                 depobj_list) bind(c)
+            use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+            import :: omp_depend_kind
+            integer(c_int) :: omp_target_memcpy_rect_async
+            type(c_ptr), value :: dst, src
+            integer(c_size_t), value :: element_size
+            integer(c_int), value :: num_dims, dst_device_num, src_device_num, 
&
+                                     depobj_count
+            integer(c_size_t), intent(in) :: volume(*), dst_offsets(*),  &
+                                             src_offsets(*), 
dst_dimensions(*), &
+                                             src_dimensions(*)
+            integer(omp_depend_kind), optional :: depobj_list(*)
+          end function omp_target_memcpy_rect_async
+        end interface
+
+        interface
           function omp_target_associate_ptr (host_ptr, device_ptr, size, &
                                              device_offset, device_num) bind(c)
             use, intrinsic :: iso_c_binding, only : c_ptr, c_size_t, c_int
diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in
index 0f48510..7b8058b 100644
--- a/libgomp/omp_lib.h.in
+++ b/libgomp/omp_lib.h.in
@@ -378,6 +378,22 @@
       end interface
 
       interface
+        function omp_target_memcpy_async (dst, src, length, dst_offset,    &
+     &                              src_offset, dst_device_num,            &
+     &                              src_device_num, depobj_count,          &
+     &                              depobj_list) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          import :: omp_depend_kind
+          integer(c_int) :: omp_target_memcpy_async
+          type(c_ptr), value :: dst, src
+          integer(c_size_t), value :: length, dst_offset, src_offset
+          integer(c_int), value :: dst_device_num, src_device_num
+          integer(c_int), value :: depobj_count
+          integer(omp_depend_kind), optional :: depobj_list(*)
+        end function omp_target_memcpy_async
+      end interface
+
+      interface
         function omp_target_memcpy_rect (dst,src,element_size, num_dims,   &
      &                                   volume, dst_offsets,              &
      &                                   src_offsets, dst_dimensions,      &
@@ -397,6 +413,31 @@
       end interface
 
       interface
+        function omp_target_memcpy_rect_async (dst,src,element_size,        &
+     &                                         num_dims, volume,            &
+     &                                         dst_offsets, src_offsets,    &
+     &                                         dst_dimensions,              &
+     &                                         src_dimensions,              &
+     &                                         dst_device_num,              &
+     &                                         src_device_num,              &
+     &                                         depobj_count,                &
+     &                                         depobj_list) bind(c)
+          use, intrinsic :: iso_c_binding, only : c_ptr, c_int, c_size_t
+          import :: omp_depend_kind
+          integer(c_int) :: omp_target_memcpy_rect_async
+          type(c_ptr), value :: dst, src
+          integer(c_size_t), value :: element_size
+          integer(c_int), value :: num_dims, depobj_count
+          integer(c_int), value :: dst_device_num, src_device_num
+          integer(c_size_t), intent(in) :: volume(*), dst_offsets(*)
+          integer(c_size_t), intent(in) :: src_offsets(*)
+          integer(c_size_t), intent(in) :: dst_dimensions(*)
+          integer(c_size_t), intent(in) :: src_dimensions(*)
+          integer(omp_depend_kind), optional :: depobj_list(*)
+        end function omp_target_memcpy_rect_async
+      end interface
+
+      interface
         function omp_target_associate_ptr (host_ptr, device_ptr, size,     &
      &                                     device_offset, device_num)      &
      &      bind(c)
diff --git a/libgomp/target.c b/libgomp/target.c
index 86930ea..4661f5e 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -49,6 +49,8 @@ static inline void * htab_alloc (size_t size) { return 
gomp_malloc (size); }
 static inline void htab_free (void *ptr) { free (ptr); }
 #include "hashtab.h"
 
+ialias_redirect (GOMP_task)
+
 static inline hashval_t
 htab_hash (hash_entry_type element)
 {
@@ -3355,40 +3357,49 @@ omp_target_is_present (const void *ptr, int device_num)
   return ret;
 }
 
-int
-omp_target_memcpy (void *dst, const void *src, size_t length,
-                  size_t dst_offset, size_t src_offset, int dst_device_num,
-                  int src_device_num)
+static int
+omp_target_memcpy_check (int dst_device_num, int src_device_num,
+                        struct gomp_device_descr **dst_devicep,
+                        struct gomp_device_descr **src_devicep)
 {
-  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
-  bool ret;
-
   if (dst_device_num != gomp_get_num_devices ())
     {
       if (dst_device_num < 0)
        return EINVAL;
 
-      dst_devicep = resolve_device (dst_device_num);
-      if (dst_devicep == NULL)
+      *dst_devicep = resolve_device (dst_device_num);
+      if (*dst_devicep == NULL)
        return EINVAL;
 
-      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-         || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-       dst_devicep = NULL;
+      if (!((*dst_devicep)->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+         || (*dst_devicep)->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+       *dst_devicep = NULL;
     }
+
   if (src_device_num != num_devices_openmp)
     {
       if (src_device_num < 0)
        return EINVAL;
 
-      src_devicep = resolve_device (src_device_num);
-      if (src_devicep == NULL)
+      *src_devicep = resolve_device (src_device_num);
+      if (*src_devicep == NULL)
        return EINVAL;
 
-      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-         || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-       src_devicep = NULL;
+      if (!((*src_devicep)->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
+         || (*src_devicep)->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
+       *src_devicep = NULL;
     }
+
+  return 0;
+}
+
+static int
+omp_target_memcpy_copy (void *dst, const void *src, size_t length,
+                       size_t dst_offset, size_t src_offset,
+                       struct gomp_device_descr *dst_devicep,
+                       struct gomp_device_descr *src_devicep)
+{
+  bool ret;
   if (src_devicep == NULL && dst_devicep == NULL)
     {
       memcpy ((char *) dst + dst_offset, (char *) src + src_offset, length);
@@ -3424,6 +3435,90 @@ omp_target_memcpy (void *dst, const void *src, size_t 
length,
   return EINVAL;
 }
 
+int
+omp_target_memcpy (void *dst, const void *src, size_t length, size_t 
dst_offset,
+                  size_t src_offset, int dst_device_num, int src_device_num)
+{
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+  int ret;
+
+  ret = omp_target_memcpy_check (dst_device_num, src_device_num, &dst_devicep,
+                                &src_devicep);
+
+  if (ret)
+    return ret;
+
+  ret = omp_target_memcpy_copy (dst, src, length, dst_offset, src_offset,
+                               dst_devicep, src_devicep);
+
+  return ret;
+}
+
+typedef struct
+{
+  void *dst;
+  const void *src;
+  size_t length;
+  size_t dst_offset;
+  size_t src_offset;
+  struct gomp_device_descr *dst_devicep;
+  struct gomp_device_descr *src_devicep;
+} omp_target_memcpy_data;
+
+static void
+omp_target_memcpy_async_helper (void *args)
+{
+  omp_target_memcpy_data *a = args;
+  if (omp_target_memcpy_copy (a->dst, a->src, a->length, a->dst_offset,
+                             a->src_offset, a->dst_devicep, a->src_devicep))
+    gomp_fatal ("omp_target_memcpy failed");
+}
+
+int
+omp_target_memcpy_async (void *dst, const void *src, size_t length,
+                        size_t dst_offset, size_t src_offset,
+                        int dst_device_num, int src_device_num,
+                        int depobj_count, omp_depend_t *depobj_list)
+{
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+  void (*fn) (void *) = &omp_target_memcpy_async_helper;
+  unsigned int flags = 0;
+  void *data;
+  void *depend[depobj_count + 5];
+  int i;
+  int check = omp_target_memcpy_check (dst_device_num, src_device_num,
+                                      &dst_devicep, &src_devicep);
+
+  omp_target_memcpy_data s = {
+    .dst = dst,
+    .src = src,
+    .length = length,
+    .dst_offset = dst_offset,
+    .src_offset = src_offset,
+    .dst_devicep = dst_devicep,
+    .src_devicep = src_devicep
+  };
+  data = &s;
+
+  if (check)
+    return check;
+
+  depend[0] = 0;
+  depend[1] = (void *) (uintptr_t) depobj_count;
+  depend[2] = depend[3] = depend[4] = 0;
+  for (i = 0; i < depobj_count; ++i)
+    depend[i + 5] = &depobj_list[i];
+
+  if (depobj_count > 0 && depobj_list != NULL)
+    flags |= GOMP_TASK_FLAG_DEPEND;
+
+  GOMP_task (fn, data, /*cpyfn=*/NULL, /*arg_size=*/0, /*arg_align=*/0,
+            /*if_clause=*/false, flags, depend, /*priority_arg=*/0,
+            /*detach=*/NULL);
+
+  return 0;
+}
+
 static int
 omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size,
                               int num_dims, const size_t *volume,
@@ -3500,50 +3595,36 @@ omp_target_memcpy_rect_worker (void *dst, const void 
*src, size_t element_size,
   return 0;
 }
 
-int
-omp_target_memcpy_rect (void *dst, const void *src, size_t element_size,
-                       int num_dims, const size_t *volume,
-                       const size_t *dst_offsets,
-                       const size_t *src_offsets,
-                       const size_t *dst_dimensions,
-                       const size_t *src_dimensions,
-                       int dst_device_num, int src_device_num)
+static int
+omp_target_memcpy_rect_check (void *dst, const void *src, int dst_device_num,
+                             int src_device_num,
+                             struct gomp_device_descr **dst_devicep,
+                             struct gomp_device_descr **src_devicep)
 {
-  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
-
   if (!dst && !src)
     return INT_MAX;
 
-  if (dst_device_num != gomp_get_num_devices ())
-    {
-      if (dst_device_num < 0)
-       return EINVAL;
-
-      dst_devicep = resolve_device (dst_device_num);
-      if (dst_devicep == NULL)
-       return EINVAL;
-
-      if (!(dst_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-         || dst_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-       dst_devicep = NULL;
-    }
-  if (src_device_num != num_devices_openmp)
-    {
-      if (src_device_num < 0)
-       return EINVAL;
-
-      src_devicep = resolve_device (src_device_num);
-      if (src_devicep == NULL)
-       return EINVAL;
-
-      if (!(src_devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
-         || src_devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM)
-       src_devicep = NULL;
-    }
+  int ret = omp_target_memcpy_check (dst_device_num, src_device_num,
+                                    dst_devicep, src_devicep);
+  if (ret)
+    return ret;
 
-  if (src_devicep != NULL && dst_devicep != NULL && src_devicep != dst_devicep)
+  if (*src_devicep != NULL && *dst_devicep != NULL && *src_devicep != 
*dst_devicep)
     return EINVAL;
 
+  return 0;
+}
+
+static int
+omp_target_memcpy_rect_copy (void *dst, const void *src,
+                            size_t element_size, int num_dims,
+                            const size_t *volume, const size_t *dst_offsets,
+                            const size_t *src_offsets,
+                            const size_t *dst_dimensions,
+                            const size_t *src_dimensions,
+                            struct gomp_device_descr *dst_devicep,
+                            struct gomp_device_descr *src_devicep)
+{
   if (src_devicep)
     gomp_mutex_lock (&src_devicep->lock);
   else if (dst_devicep)
@@ -3556,9 +3637,119 @@ omp_target_memcpy_rect (void *dst, const void *src, 
size_t element_size,
     gomp_mutex_unlock (&src_devicep->lock);
   else if (dst_devicep)
     gomp_mutex_unlock (&dst_devicep->lock);
+
+  return ret;
+}
+
+int
+omp_target_memcpy_rect (void *dst, const void *src, size_t element_size,
+                       int num_dims, const size_t *volume,
+                       const size_t *dst_offsets,
+                       const size_t *src_offsets,
+                       const size_t *dst_dimensions,
+                       const size_t *src_dimensions,
+                       int dst_device_num, int src_device_num)
+{
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+
+  int check = omp_target_memcpy_rect_check (dst, src, dst_device_num,
+                                           src_device_num, &dst_devicep,
+                                           &src_devicep);
+
+  if (check)
+    return check;
+
+  int ret = omp_target_memcpy_rect_copy (dst, src, element_size, num_dims,
+                                        volume, dst_offsets, src_offsets,
+                                        dst_dimensions, src_dimensions,
+                                        dst_devicep, src_devicep);
+
   return ret;
 }
 
+typedef struct
+{
+  void *dst;
+  const void *src;
+  size_t element_size;
+  int num_dims;
+  const size_t *volume;
+  const size_t *dst_offsets;
+  const size_t *src_offsets;
+  const size_t *dst_dimensions;
+  const size_t *src_dimensions;
+  struct gomp_device_descr *dst_devicep;
+  struct gomp_device_descr *src_devicep;
+} omp_target_memcpy_rect_data;
+
+static void
+omp_target_memcpy_rect_async_helper (void *args)
+{
+  omp_target_memcpy_rect_data *a = args;
+  int ret = omp_target_memcpy_rect_copy (a->dst, a->src, a->element_size,
+                                        a->num_dims, a->volume, a->dst_offsets,
+                                        a->src_offsets, a->dst_dimensions,
+                                        a->src_dimensions, a->dst_devicep,
+                                        a->src_devicep);
+  if (ret)
+    gomp_fatal ("omp_target_memcpy_rect failed");
+}
+
+int
+omp_target_memcpy_rect_async (void *dst, const void *src, size_t element_size,
+                             int num_dims, const size_t *volume,
+                             const size_t *dst_offsets,
+                             const size_t *src_offsets,
+                             const size_t *dst_dimensions,
+                             const size_t *src_dimensions,
+                             int dst_device_num, int src_device_num,
+                             int depobj_count, omp_depend_t *depobj_list)
+{
+  struct gomp_device_descr *dst_devicep = NULL, *src_devicep = NULL;
+
+  void (*fn) (void *) = &omp_target_memcpy_rect_async_helper;
+  unsigned flags = 0;
+  int check = omp_target_memcpy_rect_check (dst, src, dst_device_num,
+                                           src_device_num, &dst_devicep,
+                                           &src_devicep);
+  void *data;
+  void *depend[depobj_count + 5];
+  int i;
+
+  omp_target_memcpy_rect_data s = {
+    .dst = dst,
+    .src = src,
+    .element_size = element_size,
+    .num_dims = num_dims,
+    .volume = volume,
+    .dst_offsets = dst_offsets,
+    .src_offsets = src_offsets,
+    .dst_dimensions = dst_dimensions,
+    .src_dimensions = src_dimensions,
+    .dst_devicep = dst_devicep,
+    .src_devicep = src_devicep
+  };
+  data = &s;
+
+  if (check)
+    return check;
+
+  depend[0] = 0;
+  depend[1] = (void *) (uintptr_t) depobj_count;
+  depend[2] = depend[3] = depend[4] = 0;
+  for (i = 0; i < depobj_count; ++i)
+    depend[i + 5] = &depobj_list[i];
+
+  if (depobj_count > 0 && depobj_list != NULL)
+    flags |= GOMP_TASK_FLAG_DEPEND;
+
+  GOMP_task (fn, data, /*cpyfn=*/NULL, /*arg_size=*/0, /*arg_align=*/0,
+            /*if_clause=*/false, flags, depend, /*priority_arg=*/0,
+            /*detach=*/NULL);
+
+  return 0;
+}
+
 int
 omp_target_associate_ptr (const void *host_ptr, const void *device_ptr,
                          size_t size, size_t device_offset, int device_num)
diff --git a/libgomp/task.c b/libgomp/task.c
index 828348c..d1bb3ba 100644
--- a/libgomp/task.c
+++ b/libgomp/task.c
@@ -581,6 +581,7 @@ GOMP_task (void (*fn) (void *), void *data, void (*cpyfn) 
(void *, void *),
     }
 }
 
+ialias (GOMP_task)
 ialias (GOMP_taskgroup_start)
 ialias (GOMP_taskgroup_end)
 ialias (GOMP_taskgroup_reduction_register)
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-1.c 
b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-1.c
new file mode 100644
index 0000000..f25c3bb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-1.c
@@ -0,0 +1,46 @@
+/* Test for omp_target_memcpy_async without considering dependence objects.  */
+
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = i;
+
+  if (omp_target_memcpy_async (p, q, 128 * sizeof (int), sizeof (int), 0, d, 
id,
+      0, NULL))
+    abort ();
+
+  #pragma omp taskwait
+
+  int q2[128];
+  for (i = 0; i < 128; ++i)
+    q2[i] = 0;
+  if (omp_target_memcpy_async (q2, p, 128 * sizeof(int), 0, sizeof (int), id, 
d,
+      0, NULL))
+    abort ();
+
+  #pragma omp taskwait
+
+  for (i = 0; i < 128; ++i)
+    if (q2[i] != q[i])
+      abort ();
+
+  omp_target_free (p, d);
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-2.c 
b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-2.c
new file mode 100644
index 0000000..d1353a5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-async-2.c
@@ -0,0 +1,74 @@
+/* Test for omp_target_memcpy_async considering dependence objects.  */
+
+#include <omp.h>
+#include <stdlib.h>
+
+int
+main ()
+{
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[32], e[16], q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; ++i)
+    a[i] = i + 1;
+  for (i = 0; i < 64; ++i)
+    b[i] = i + 2;
+  for (i = 0; i < 32; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = i + 4;
+
+  omp_depend_t obj[2];
+
+  #pragma omp parallel num_threads(5)
+  #pragma omp single
+  {
+    #pragma omp task depend(out: p)
+    omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+
+    #pragma omp task depend(inout: p)
+    omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+
+    #pragma omp task depend(out: c)
+    for (i = 0; i < 32; i++)
+      c[i] = i + 3;
+
+    #pragma omp depobj(obj[0]) depend(inout: p)
+    #pragma omp depobj(obj[1]) depend(in: c)
+    omp_target_memcpy_async (p, c, 32 * sizeof (int), 0, 0, d, id, 2, obj);
+
+    #pragma omp task depend(in: p)
+    omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+
+  #pragma omp taskwait
+
+  for (i = 0; i < 128; ++i)
+    q[i] = 0;
+  omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d);
+  for (i = 0; i < 16; ++i)
+    if (q[i] != i + 4)
+      abort ();
+  for (i = 16; i < 32; ++i)
+    if (q[i] != i + 3)
+      abort ();
+  for (i = 32; i < 64; ++i)
+    if (q[i] != i + 2)
+      abort ();
+  for (i = 64; i < 128; ++i)
+    if (q[i] != i + 1)
+      abort ();
+
+  omp_target_free (p, d);
+
+  return 0;
+}
diff --git 
a/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-1.c 
b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-1.c
new file mode 100644
index 0000000..176bceb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-1.c
@@ -0,0 +1,68 @@
+/* Test for omp_target_memcpy_rect_async without considering dependence
+   objects.  */
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int
+main ()
+{
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int q[128], q2[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  if (omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, NULL,
+                                   NULL, d, id, 0, NULL) < 3
+      || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, 
NULL,
+                                      NULL, id, d, 0, NULL) < 3
+      || omp_target_memcpy_rect_async (NULL, NULL, 0, 0, NULL, NULL, NULL, 
NULL,
+                                      NULL, id, id, 0, NULL) < 3)
+    abort ();
+
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+    abort ();
+
+  for (i = 0; i < 128; i++)
+    q[i] = i + 1;
+
+  size_t volume[NUM_DIMS] = { 1, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+
+  if (omp_target_memcpy_rect_async (p, q, sizeof (int), NUM_DIMS, volume,
+                                   dst_offsets, src_offsets, dst_dimensions,
+                                   src_dimensions, d, id, 0, NULL) != 0)
+    abort ();
+
+  #pragma omp taskwait
+
+  for (i = 0; i < 128; i++)
+    q2[i] = 0;
+  if (omp_target_memcpy (q2, p, 128 * sizeof (int), 0, 0, id, d) != 0)
+    abort ();
+
+  /* q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0  */
+  if (q2[0] != 1 || q2[1] != 2 || q2[2] !=3 || q2[3] != 0 || q2[4] != 0
+      || q2[5] != 5 || q2[6] != 6 || q2[7] != 7)
+    abort ();
+  for (i = 8; i < 128; ++i)
+    if (q2[i] != 0)
+      abort ();
+
+  omp_target_free (p, d);
+  return 0;
+}
diff --git 
a/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-2.c 
b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-2.c
new file mode 100644
index 0000000..4a5d80f
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/target-memcpy-rect-async-2.c
@@ -0,0 +1,91 @@
+/* Test for omp_target_memcpy_rect_async considering dependence objects.  */
+
+#include <omp.h>
+#include <stdlib.h>
+
+#define NUM_DIMS 3
+
+int
+main ()
+{
+  int d = omp_get_default_device ();
+  int id = omp_get_initial_device ();
+  int a[128], b[64], c[128], e[16], q[128], i;
+  void *p;
+
+  if (d < 0 || d >= omp_get_num_devices ())
+    d = id;
+
+  p = omp_target_alloc (130 * sizeof (int), d);
+  if (p == NULL)
+    return 0;
+
+  for (i = 0; i < 128; i++)
+    q[i] = 0;
+  if (omp_target_memcpy (p, q, 128 * sizeof (int), 0, 0, d, id) != 0)
+    abort ();
+
+  size_t volume[NUM_DIMS] = { 2, 2, 3 };
+  size_t dst_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t src_offsets[NUM_DIMS] = { 0, 0, 0 };
+  size_t dst_dimensions[NUM_DIMS] = { 3, 4, 5 };
+  size_t src_dimensions[NUM_DIMS] = { 2, 3, 4 };
+
+  for (i = 0; i < 128; i++)
+    a[i] = 42;
+  for (i = 0; i < 64; i++)
+    b[i] = 24;
+  for (i = 0; i < 128; i++)
+    c[i] = 0;
+  for (i = 0; i < 16; i++)
+    e[i] = 77;
+
+  omp_depend_t obj[2];
+
+  #pragma omp parallel num_threads(5)
+  #pragma omp single
+  {
+    #pragma omp task depend (out: p)
+    omp_target_memcpy (p, a, 128 * sizeof (int), 0, 0, d, id);
+
+    #pragma omp task depend(inout: p)
+    omp_target_memcpy (p, b, 64 * sizeof (int), 0, 0, d, id);
+
+    #pragma omp task depend(out: c)
+    for (i = 0; i < 128; i++)
+      c[i] = i + 1;
+
+    #pragma omp depobj(obj[0]) depend(inout: p)
+    #pragma omp depobj(obj[1]) depend(in: c)
+
+    /*  This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+                      13 14 15 - - 17 18 19 - - at positions 20..29.  */
+    omp_target_memcpy_rect_async (p, c, sizeof (int), NUM_DIMS, volume,
+                                 dst_offsets, src_offsets, dst_dimensions,
+                                 src_dimensions, d, id, 2, obj);
+
+    #pragma omp task depend(in: p)
+    omp_target_memcpy (p, e, 16 * sizeof (int), 0, 0, d, id);
+  }
+
+  #pragma omp taskwait
+
+  if (omp_target_memcpy (q, p, 128 * sizeof(int), 0, 0, id, d) != 0)
+    abort ();
+
+  for (i = 0; i < 16; ++i)
+    if (q[i] != 77)
+      abort ();
+  if (q[20] != 13 || q[21] != 14 || q[22] != 15 || q[25] != 17 || q[26] != 18
+      || q[27] != 19)
+    abort ();
+  for (i = 28; i < 64; ++i)
+    if (q[i] != 24)
+      abort ();
+  for (i = 64; i < 128; ++i)
+    if (q[i] != 42)
+      abort ();
+
+  omp_target_free (p, d);
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-async-1.f90 
b/libgomp/testsuite/libgomp.fortran/target-memcpy-async-1.f90
new file mode 100644
index 0000000..4679fd2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-async-1.f90
@@ -0,0 +1,42 @@
+! Test for omp_target_memcpy_async without considering dependence objects.
+
+program main
+  use omp_lib
+  use iso_c_binding
+  implicit none (external, type)
+  integer :: d, id, i, j
+  integer, target :: q(0:127), q2(0:127)
+  type(c_ptr) :: p
+  integer(omp_depend_kind) :: obj(1:0)
+
+  d = omp_get_default_device ()
+  id = omp_get_initial_device ()
+
+  if (d < 0 .or. d >= omp_get_num_devices ()) &
+    d = id
+
+  p = omp_target_alloc (130 * c_sizeof (q), d)
+  if (.not. c_associated (p)) &
+    stop 0  ! okay
+
+  q = [(i, i = 0, 127)]
+  if (omp_target_memcpy_async (p, c_loc (q), 128 * sizeof (q(0)), 0_c_size_t, &
+      0_c_size_t, d, id, 0, obj) /= 0) &
+    stop 1
+
+  !$omp taskwait
+
+  q2 = [(0, i = 0, 127)]
+  if (omp_target_memcpy_async (c_loc (q2), p, 128 * sizeof (q2(0)), 
0_c_size_t,&
+      0_c_size_t, id, d, 0, obj) /= 0) &
+    stop 2
+
+  !$omp taskwait
+
+  do j = 0, 127
+    if (q(j) /= q2(j)) &
+      stop 3
+  end do
+
+  call omp_target_free (p, d)
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-async-2.f90 
b/libgomp/testsuite/libgomp.fortran/target-memcpy-async-2.f90
new file mode 100644
index 0000000..2aa482a
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-async-2.f90
@@ -0,0 +1,91 @@
+! Test for omp_target_memcpy_async considering dependence objects.
+
+program main
+  use omp_lib
+  use iso_c_binding
+  implicit none (external, type)
+  integer :: d, id, i, j
+  integer, target :: a(0:127), b(0:63), c(0:31), e(0:15), q(0:127)
+  type(c_ptr) :: p
+  integer(omp_depend_kind) :: obj(0:1)
+
+  d = omp_get_default_device ()
+  id = omp_get_initial_device ()
+
+  if (d < 0 .or. d >= omp_get_num_devices ()) &
+    d = id
+
+  p = omp_target_alloc (130 * c_sizeof (q), d)
+  if (.not. c_associated (p)) &
+    stop 0  ! okay
+
+  a = [(i + 1, i = 0, 127)]
+  b = [(i + 2, i = 0, 63)]
+  c = [(0, i = 0, 31)]
+  e = [(i + 4, i = 0, 15)]
+
+  !$omp parallel num_threads(5)
+  !$omp single
+
+    !$omp task depend(out: p)
+    if (omp_target_memcpy (p, c_loc (a), 128 * sizeof (a(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 1
+    !$omp end task
+
+    !$omp task depend(inout: p)
+    if (omp_target_memcpy (p, c_loc (b), 64 * sizeof (b(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 2
+    !$omp end task
+
+    !$omp task depend(out: c)
+    do j = 0, 31
+      c(j) = j + 3
+    end do
+    !$omp end task
+
+    !$omp depobj(obj(0)) depend(inout: p)
+    !$omp depobj(obj(1)) depend(in: c)
+    if (omp_target_memcpy_async (p, c_loc (c), 32 * sizeof (c(0)), 0_c_size_t, 
&
+                                 0_c_size_t, d, id, 2, obj) /= 0) &
+      stop 3
+
+    !$omp task depend(in: p)
+    if (omp_target_memcpy (p, c_loc (e), 16 * sizeof (e(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 4
+    !$omp end task
+
+  !$omp end single
+  !$omp end parallel
+
+  !$omp taskwait
+
+  q = [(0, i = 0, 127)]
+  if (omp_target_memcpy (c_loc (q), p, 128 * sizeof (q(0)), 0_c_size_t, &
+                         0_c_size_t, id, d) /= 0) &
+    stop 5
+
+  do j = 0, 15
+    if (q(j) /= j+4) &
+      stop 10
+  end do
+
+  do j = 16, 31
+    if (q(j) /= j+3) &
+      stop 11
+  end do
+
+  do j = 32, 63
+    if (q(j) /= j+2) &
+      stop 12
+  end do
+
+  do j = 64, 127
+    if (q(j) /= j+1) &
+      stop 13
+  end do
+
+  call omp_target_free (p, d)
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-1.f90 
b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-1.f90
new file mode 100644
index 0000000..c8c87c5
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-1.f90
@@ -0,0 +1,86 @@
+! Test for omp_target_memcpy_rect_async without considering dependence objects.
+
+program main
+  use omp_lib
+  use iso_c_binding
+  implicit none (external, type)
+  integer :: d, id, i, j
+  integer, target :: q(0:127), q2(0:127)
+  type(c_ptr) :: p
+  integer(omp_depend_kind) :: obj(1:0)
+
+  integer(kind=c_size_t) :: volume(0:2)
+  integer(kind=c_size_t) :: dst_offsets(0:2)
+  integer(kind=c_size_t) :: src_offsets(0:2)
+  integer(kind=c_size_t) :: dst_dimensions(0:2)
+  integer(kind=c_size_t) :: src_dimensions(0:2)
+  integer(kind=c_size_t) :: empty(1:0)
+
+  d = omp_get_default_device ()
+  id = omp_get_initial_device ()
+
+  if (d < 0 .or. d >= omp_get_num_devices ()) &
+    d = id
+
+  p = omp_target_alloc (130 * c_sizeof (q), d)
+  if (.not. c_associated (p)) &
+    stop 0  ! okay
+
+  if (omp_target_memcpy_rect_async (C_NULL_PTR, C_NULL_PTR, 0_c_size_t, 0, &
+                                    empty, empty, empty, empty,  empty, d, id, 
&
+                                    0, obj) < 3 &
+     .or. omp_target_memcpy_rect_async (C_NULL_PTR, C_NULL_PTR, 0_c_size_t, 0, 
&
+                                        empty, empty, empty, empty, empty, &
+                                        id, d, 0, obj) < 3 &
+     .or. omp_target_memcpy_rect_async (C_NULL_PTR, C_NULL_PTR, 0_c_size_t, 0, 
&
+                                        empty, empty, empty, empty, empty, &
+                                        id, id, 0, obj) < 3) &
+    stop 1
+
+  q = [(0, i = 0, 127)]
+  if (omp_target_memcpy (p, c_loc (q), 128 * sizeof (q(0)), 0_c_size_t, &
+                         0_c_size_t, d, id) /= 0) &
+    stop 2
+
+  q = [(i+1, i = 0, 127)]
+
+  volume(2) = 3
+  volume(1) = 2
+  volume(0) = 1
+  dst_offsets(2) = 0
+  dst_offsets(1) = 0
+  dst_offsets(0) = 0
+  src_offsets(2) = 0
+  src_offsets(1) = 0
+  src_offsets(0) = 0
+  dst_dimensions(2) = 5
+  dst_dimensions(1) = 4
+  dst_dimensions(0) = 3
+  src_dimensions(2) = 4
+  src_dimensions(1) = 3
+  src_dimensions(0) = 2
+
+  if (omp_target_memcpy_rect_async (p, c_loc (q), sizeof (q(0)), 3, volume, &
+      dst_offsets, src_offsets, dst_dimensions, src_dimensions, d, id, 0, &
+      obj) /= 0) &
+    stop 3
+
+  !$omp taskwait
+
+  q2 = [(0, i = 0, 127)]
+  if (omp_target_memcpy (c_loc (q2), p, 128 * sizeof (q2(0)), 0_c_size_t, &
+                         0_c_size_t, id, d) /= 0) &
+    stop 4
+
+  ! q2 is expected to contain: 1 2 3 0 0 5 6 7 0 0 .. 0
+  if (q2(0) /= 1 .or. q2(1) /= 2 .or. q2(2) /= 3 .or. q2(3) /= 0 &
+      .or. q2(4) /= 0 .or. q2(5) /= 5 .or. q2(6) /= 6 .or. q2(7) /= 7) &
+    stop 5
+
+  do j = 8, 127
+    if (q2(j) /= 0) &
+      stop 6
+  end do
+
+  call omp_target_free (p, d)
+end program main
diff --git a/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-2.f90 
b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-2.f90
new file mode 100644
index 0000000..d0bc5ff
--- /dev/null
+++ b/libgomp/testsuite/libgomp.fortran/target-memcpy-rect-async-2.f90
@@ -0,0 +1,117 @@
+! Test for omp_target_memcpy_rect_async considering dependence objects.
+
+program main
+  use omp_lib
+  use iso_c_binding
+  implicit none (external, type)
+  integer :: d, id, i, j
+  integer, target :: a(0:127), b(0:63), c(0:127), e(0:15), q(0:127)
+  type(c_ptr) :: p
+  integer(omp_depend_kind) :: obj(0:2)
+
+  integer(kind=c_size_t) :: volume(0:2)
+  integer(kind=c_size_t) :: dst_offsets(0:2)
+  integer(kind=c_size_t) :: src_offsets(0:2)
+  integer(kind=c_size_t) :: dst_dimensions(0:2)
+  integer(kind=c_size_t) :: src_dimensions(0:2)
+
+  d = omp_get_default_device ()
+  id = omp_get_initial_device ()
+
+  if (d < 0 .or. d >= omp_get_num_devices ()) &
+    d = id
+
+  p = omp_target_alloc (130 * c_sizeof (q), d)
+  if (.not. c_associated (p)) &
+    stop 0  ! okay
+
+  a = [(42, i = 0, 127)]
+  b = [(24, i = 0, 63)]
+  c = [(0, i = 0, 127)]
+  e = [(77, i = 0, 15)]
+
+  volume(2) = 3
+  volume(1) = 2
+  volume(0) = 2
+  dst_offsets(2) = 0
+  dst_offsets(1) = 0
+  dst_offsets(0) = 0
+  src_offsets(2) = 0
+  src_offsets(1) = 0
+  src_offsets(0) = 0
+  dst_dimensions(2) = 5
+  dst_dimensions(1) = 4
+  dst_dimensions(0) = 3
+  src_dimensions(2) = 4
+  src_dimensions(1) = 3
+  src_dimensions(0) = 2
+
+  !$omp parallel num_threads(5)
+  !$omp single
+
+    !$omp task depend(out: p)
+    if (omp_target_memcpy (p, c_loc (a), 128 * sizeof (a(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 1
+    !$omp end task
+
+    !$omp task depend(inout: p)
+    if (omp_target_memcpy (p, c_loc (b), 64 * sizeof (b(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 2
+    !$omp end task
+
+    !$omp task depend(out: c)
+    do j = 0, 127
+      c(j) = j + 1
+    end do
+    !$omp end task
+
+    !$omp depobj(obj(0)) depend(inout: p)
+    !$omp depobj(obj(1)) depend(in: c)
+
+    ! This produces: 1 2 3 - - 5 6 7 - - at positions 0..9 and
+    !                13 14 15 - - 17 18 19 - - at positions 20..29.
+    if (omp_target_memcpy_rect_async (p, c_loc (c), sizeof (c(0)), 3, volume, &
+                                      dst_offsets, src_offsets, &
+                                      dst_dimensions, src_dimensions, d, id, &
+                                      2, obj) /= 0) &
+      stop 3
+
+    !$omp task depend(in: p)
+    if (omp_target_memcpy (p, c_loc (e), 16 * sizeof (e(0)), 0_c_size_t, &
+                           0_c_size_t, d, id) /= 0) &
+      stop 4
+    !$omp end task
+
+  !$omp end single
+  !$omp end parallel
+
+  !$omp taskwait
+
+  q = [(0, i = 0, 127)]
+  if (omp_target_memcpy (c_loc (q), p, 128 * sizeof (q(0)), 0_c_size_t, &
+                         0_c_size_t, id, d) /= 0) &
+    stop 5
+
+  do j = 0, 15
+    if (q(j) /= 77) &
+      stop 6
+  end do
+
+  if (q(20) /= 13 .or. q(21) /= 14 .or. q(22) /= 15 .or. q(25) /= 17 &
+      .or. q(26) /= 18 .or. q(27) /= 19) &
+    stop 7
+
+  do j = 28, 63
+    if (q(j) /= 24) &
+      stop 8
+  end do
+
+  do j = 64, 127
+    if (q(j) /= 42) &
+      stop 9
+  end do
+
+  call omp_target_free (p, d)
+end program main

Reply via email to