Hi!

On Tue, Sep 08, 2015 at 11:20:14 +0200, Jakub Jelinek wrote:
> nowait support for #pragma omp target is not implemented yet, supposedly we
> need to mark those somehow (some flag) already in the struct gomp_task
> structure, essentially it will need either 2 or 3 callbacks
> (the current one, executed when the dependencies are resolved (it actually
> waits until some thread schedules it after that point, I think it is
> undesirable to run it with the tasking lock held), which would perform
> the gomp_map_vars and initiate the running of the region, and then some
> query routine which would poll the plugin whether the task is done or not,
> and either perform the finalization (unmap_vars) if it is done (and in any
> case return bool whether it should be polled again or not), and if the
> finalization is not done there, also another callback for the finalization.
> Also, there is the issue that if we are waiting for task that needs to be
> polled, and we don't have any further tasks to run, we shouldn't really
> attempt to sleep on some semaphore (e.g. in taskwait, end of
> taskgroup, etc.) or barrier, but rather either need to keep polling it, or
> call the query hook with some argument that it should sleep in there until
> the work is done by the offloading device.
> Also, there needs to be a way for the target nowait first callback to say
> that it is using host fallback and thus acts as a normal task, therefore
> once the task fn finishes, the task is done.

Here is my WIP patch.  target.c part is obviously incorrect, but it demonstrates
a possible libgomp <-> plugin interface for running a target task function
asynchronously and checking whether it is completed or not.
(Refactored liboffloadmic/runtime/emulator from trunk is required to run
target-tmp.c testcase.)


diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index d798321..8e2b5aa 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -872,6 +872,8 @@ struct gomp_device_descr
   void *(*host2dev_func) (int, void *, const void *, size_t);
   void *(*dev2dev_func) (int, void *, const void *, size_t);
   void (*run_func) (int, void *, void *);
+  void (*async_run_func) (int, void *, void *, const void *);
+  bool (*async_is_completed_func) (int, const void *);
 
   /* Splay tree containing information about mapped memory regions.  */
   struct splay_tree_s mem_map;
diff --git a/libgomp/target.c b/libgomp/target.c
index 77bd442..31f034c 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -45,6 +45,10 @@
 #include "plugin-suffix.h"
 #endif
 
+/* FIXME: TMP */
+#include <stdio.h>
+#include <unistd.h>
+
 static void gomp_target_init (void);
 
 /* The whole initialization code for offloading plugins is only run one.  */
@@ -1227,6 +1231,44 @@ gomp_target_fallback (void (*fn) (void *), void 
**hostaddrs)
   *thr = old_thr;
 }
 
+/* Host fallback with firstprivate map-type handling.  */
+
+static void
+gomp_target_fallback_firstprivate (void (*fn) (void *), size_t mapnum,
+                                  void **hostaddrs, size_t *sizes,
+                                  unsigned short *kinds)
+{
+  size_t i, tgt_align = 0, tgt_size = 0;
+  char *tgt = NULL;
+  for (i = 0; i < mapnum; i++)
+    if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+      {
+       size_t align = (size_t) 1 << (kinds[i] >> 8);
+       if (tgt_align < align)
+         tgt_align = align;
+       tgt_size = (tgt_size + align - 1) & ~(align - 1);
+       tgt_size += sizes[i];
+      }
+  if (tgt_align)
+    {
+      tgt = gomp_alloca (tgt_size + tgt_align - 1);
+      uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
+      if (al)
+       tgt += tgt_align - al;
+      tgt_size = 0;
+      for (i = 0; i < mapnum; i++)
+       if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
+         {
+           size_t align = (size_t) 1 << (kinds[i] >> 8);
+           tgt_size = (tgt_size + align - 1) & ~(align - 1);
+           memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
+           hostaddrs[i] = tgt + tgt_size;
+           tgt_size = tgt_size + sizes[i];
+         }
+    }
+  gomp_target_fallback (fn, hostaddrs);
+}
+
 /* Helper function of GOMP_target{,_41} routines.  */
 
 static void *
@@ -1311,40 +1353,19 @@ GOMP_target_41 (int device, void (*fn) (void *), size_t 
mapnum,
   if (devicep == NULL
       || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     {
-      size_t i, tgt_align = 0, tgt_size = 0;
-      char *tgt = NULL;
-      for (i = 0; i < mapnum; i++)
-       if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-         {
-           size_t align = (size_t) 1 << (kinds[i] >> 8);
-           if (tgt_align < align)
-             tgt_align = align;
-           tgt_size = (tgt_size + align - 1) & ~(align - 1);
-           tgt_size += sizes[i];
-         }
-      if (tgt_align)
-       {
-         tgt = gomp_alloca (tgt_size + tgt_align - 1);
-         uintptr_t al = (uintptr_t) tgt & (tgt_align - 1);
-         if (al)
-           tgt += tgt_align - al;
-         tgt_size = 0;
-         for (i = 0; i < mapnum; i++)
-           if ((kinds[i] & 0xff) == GOMP_MAP_FIRSTPRIVATE)
-             {
-               size_t align = (size_t) 1 << (kinds[i] >> 8);
-               tgt_size = (tgt_size + align - 1) & ~(align - 1);
-               memcpy (tgt + tgt_size, hostaddrs[i], sizes[i]);
-               hostaddrs[i] = tgt + tgt_size;
-               tgt_size = tgt_size + sizes[i];
-             }
-       }
-      gomp_target_fallback (fn, hostaddrs);
+      gomp_target_fallback_firstprivate (fn, mapnum, hostaddrs, sizes, kinds);
       return;
     }
 
   void *fn_addr = gomp_get_target_fn_addr (devicep, fn);
 
+  if (flags & GOMP_TARGET_FLAG_NOWAIT)
+    {
+      gomp_create_target_task (devicep, fn_addr, mapnum, hostaddrs, sizes,
+                              kinds, flags, depend);
+      return;
+    }
+
   struct target_mem_desc *tgt_vars
     = gomp_map_vars (devicep, mapnum, hostaddrs, NULL, sizes, kinds, true,
                     GOMP_MAP_VARS_TARGET);
@@ -1636,34 +1657,58 @@ void
 gomp_target_task_fn (void *data)
 {
   struct gomp_target_task *ttask = (struct gomp_target_task *) data;
+  struct gomp_device_descr *devicep = ttask->devicep;
+
   if (ttask->fn != NULL)
     {
-      /* GOMP_target_41 */
+      if (devicep == NULL
+         || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+       {
+         /* FIXME: Save host fn addr into gomp_target_task?  */
+         gomp_target_fallback_firstprivate (NULL, ttask->mapnum,
+                                            ttask->hostaddrs, ttask->sizes,
+                                            ttask->kinds);
+         return;
+       }
+
+      struct target_mem_desc *tgt_vars
+       = gomp_map_vars (devicep, ttask->mapnum, ttask->hostaddrs, NULL,
+                        ttask->sizes, ttask->kinds, true,
+                        GOMP_MAP_VARS_TARGET);
+      devicep->async_run_func (devicep->target_id, ttask->fn,
+                              (void *) tgt_vars->tgt_start, data);
+
+      /* FIXME: TMP example of checking for completion.
+        Alternatively the plugin can set some completion flag in ttask.  */
+      while (!devicep->async_is_completed_func (devicep->target_id, data))
+       {
+         fprintf (stderr, "-");
+         usleep (100000);
+       }
     }
-  else if (ttask->devicep == NULL
-          || !(ttask->devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
+  else if (devicep == NULL
+          || !(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400))
     return;
 
   size_t i;
   if (ttask->flags & GOMP_TARGET_FLAG_UPDATE)
-    gomp_update (ttask->devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+    gomp_update (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
                 ttask->kinds, true);
   else if ((ttask->flags & GOMP_TARGET_FLAG_EXIT_DATA) == 0)
     for (i = 0; i < ttask->mapnum; i++)
       if ((ttask->kinds[i] & 0xff) == GOMP_MAP_STRUCT)
        {
-         gomp_map_vars (ttask->devicep, ttask->sizes[i] + 1,
-                        &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
-                        &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
+         gomp_map_vars (devicep, ttask->sizes[i] + 1, &ttask->hostaddrs[i],
+                        NULL, &ttask->sizes[i], &ttask->kinds[i], true,
+                        GOMP_MAP_VARS_ENTER_DATA);
          i += ttask->sizes[i];
        }
       else
-       gomp_map_vars (ttask->devicep, 1, &ttask->hostaddrs[i], NULL,
-                      &ttask->sizes[i], &ttask->kinds[i],
-                      true, GOMP_MAP_VARS_ENTER_DATA);
+       gomp_map_vars (devicep, 1, &ttask->hostaddrs[i], NULL, &ttask->sizes[i],
+                      &ttask->kinds[i], true, GOMP_MAP_VARS_ENTER_DATA);
   else
-    gomp_exit_data (ttask->devicep, ttask->mapnum, ttask->hostaddrs,
-                   ttask->sizes, ttask->kinds);
+    gomp_exit_data (devicep, ttask->mapnum, ttask->hostaddrs, ttask->sizes,
+                   ttask->kinds);
 }
 
 void
@@ -2108,6 +2153,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr 
*device,
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400)
     {
       DLSYM (run);
+      DLSYM (async_run);
+      DLSYM (async_is_completed);
       DLSYM (dev2dev);
     }
   if (device->capabilities & GOMP_OFFLOAD_CAP_OPENACC_200)
diff --git a/libgomp/testsuite/libgomp.c/target-tmp.c 
b/libgomp/testsuite/libgomp.c/target-tmp.c
new file mode 100644
index 0000000..23a739c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/target-tmp.c
@@ -0,0 +1,40 @@
+#include <stdio.h>
+#include <unistd.h>
+
+#pragma omp declare target
+void foo (int n)
+{
+  printf ("Start tgt %d\n", n);
+  usleep (5000000);
+  printf ("End tgt %d\n", n);
+}
+#pragma omp end declare target
+
+int x, y, z;
+
+int main ()
+{
+  #pragma omp parallel
+  #pragma omp single
+    {
+      #pragma omp task depend(out: x)
+      printf ("Host task\n");
+
+      #pragma omp target nowait depend(in: x) depend(out: y)
+      foo (1);
+
+      #pragma omp target nowait depend(in: y)
+      foo (2);
+
+      #pragma omp target nowait depend(in: y)
+      foo (3);
+
+      while (1)
+       {
+         usleep (333333);
+         fprintf (stderr, ".");
+       }
+    }
+
+  return 0;
+}
diff --git a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp 
b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
index 26ac6fe..c843710 100644
--- a/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
+++ b/liboffloadmic/plugin/libgomp-plugin-intelmic.cpp
@@ -34,6 +34,7 @@
 #include <string.h>
 #include <utility>
 #include <vector>
+#include <set>
 #include <map>
 #include "libgomp-plugin.h"
 #include "compiler_if_host.h"
@@ -76,9 +77,15 @@ static int num_images;
    second key is number of device.  Contains a vector of pointer pairs.  */
 static ImgDevAddrMap *address_table;
 
+/* Set of asynchronously running target tasks.  */
+static std::set<const void *> *async_tasks;
+
 /* Thread-safe registration of the main image.  */
 static pthread_once_t main_image_is_registered = PTHREAD_ONCE_INIT;
 
+/* Mutex for protecting async_tasks.  */
+static pthread_mutex_t async_tasks_lock = PTHREAD_MUTEX_INITIALIZER;
+
 static VarDesc vd_host2tgt = {
   { 1, 1 },                  /* dst, src                             */
   { 1, 0 },                  /* in, out                              */
@@ -156,6 +163,8 @@ init (void)
 
 out:
   address_table = new ImgDevAddrMap;
+  async_tasks = new std::set<const void *>;
+  pthread_mutex_init (&async_tasks_lock, NULL);
   num_devices = _Offload_number_of_devices ();
 }
 
@@ -192,11 +201,27 @@ GOMP_OFFLOAD_get_num_devices (void)
 
 static void
 offload (const char *file, uint64_t line, int device, const char *name,
-        int num_vars, VarDesc *vars, VarDesc2 *vars2)
+        int num_vars, VarDesc *vars, VarDesc2 *vars2, const void *async_data)
 {
   OFFLOAD ofld = __offload_target_acquire1 (&device, file, line);
   if (ofld)
-    __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL, NULL);
+    {
+      if (async_data == NULL)
+       __offload_offload1 (ofld, name, 0, num_vars, vars, vars2, 0, NULL,
+                           NULL);
+      else
+       {
+         pthread_mutex_lock (&async_tasks_lock);
+         async_tasks->insert (async_data);
+         pthread_mutex_unlock (&async_tasks_lock);
+
+         OffloadFlags flags;
+         flags.flags = 0;
+         flags.bits.omp_async = 1;
+         __offload_offload3 (ofld, name, 0, num_vars, vars, NULL, 0, NULL,
+                             (const void **) async_data, 0, NULL, flags, NULL);
+       }
+    }
   else
     {
       fprintf (stderr, "%s:%d: Offload target acquire failed\n", file, line);
@@ -218,7 +243,7 @@ GOMP_OFFLOAD_init_device (int device)
   TRACE ("");
   pthread_once (&main_image_is_registered, register_main_image);
   offload (__FILE__, __LINE__, device, "__offload_target_init_proc", 0,
-          NULL, NULL);
+          NULL, NULL, NULL);
 }
 
 extern "C" void
@@ -240,7 +265,7 @@ get_target_table (int device, int &num_funcs, int 
&num_vars, void **&table)
   VarDesc2 vd1g[2] = { { "num_funcs", 0 }, { "num_vars", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_table_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   int table_size = num_funcs + 2 * num_vars;
   if (table_size > 0)
@@ -254,7 +279,7 @@ get_target_table (int device, int &num_funcs, int 
&num_vars, void **&table)
       VarDesc2 vd2g = { "table", 0 };
 
       offload (__FILE__, __LINE__, device, "__offload_target_table_p2", 1,
-              &vd2, &vd2g);
+              &vd2, &vd2g, NULL);
     }
 }
 
@@ -401,8 +426,8 @@ GOMP_OFFLOAD_alloc (int device, size_t size)
   vd1[1].size = sizeof (void *);
   VarDesc2 vd1g[2] = { { "size", 0 }, { "tgt_ptr", 0 } };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g);
-
+  offload (__FILE__, __LINE__, device, "__offload_target_alloc", 2, vd1, vd1g,
+          NULL);
   return tgt_ptr;
 }
 
@@ -416,7 +441,8 @@ GOMP_OFFLOAD_free (int device, void *tgt_ptr)
   vd1.size = sizeof (void *);
   VarDesc2 vd1g = { "tgt_ptr", 0 };
 
-  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, 
&vd1g);
+  offload (__FILE__, __LINE__, device, "__offload_target_free", 1, &vd1, &vd1g,
+          NULL);
 }
 
 extern "C" void *
@@ -435,7 +461,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const 
void *host_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_host2tgt;
   vd2.ptr = (void *) host_ptr;
@@ -443,7 +469,7 @@ GOMP_OFFLOAD_host2dev (int device, void *tgt_ptr, const 
void *host_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_host2tgt_p2", 1,
-          &vd2, &vd2g);
+          &vd2, &vd2g, NULL);
 
   return tgt_ptr;
 }
@@ -464,7 +490,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const 
void *tgt_ptr,
   VarDesc2 vd1g[2] = { { "tgt_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p1", 2,
-          vd1, vd1g);
+          vd1, vd1g, NULL);
 
   VarDesc vd2 = vd_tgt2host;
   vd2.ptr = (void *) host_ptr;
@@ -472,7 +498,7 @@ GOMP_OFFLOAD_dev2host (int device, void *host_ptr, const 
void *tgt_ptr,
   VarDesc2 vd2g = { "var", 0 };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2host_p2", 1,
-          &vd2, &vd2g);
+          &vd2, &vd2g, NULL);
 
   return host_ptr;
 }
@@ -495,22 +521,56 @@ GOMP_OFFLOAD_dev2dev (int device, void *dst_ptr, const 
void *src_ptr,
   VarDesc2 vd1g[3] = { { "dst_ptr", 0 }, { "src_ptr", 0 }, { "size", 0 } };
 
   offload (__FILE__, __LINE__, device, "__offload_target_tgt2tgt", 3, vd1,
-          vd1g);
+          vd1g, NULL);
 
   return dst_ptr;
 }
 
 extern "C" void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+                       const void *async_data)
+{
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p, async_data = %p)", device,
+        tgt_fn, tgt_vars, async_data);
+
+  VarDesc vd[2] = { vd_host2tgt, vd_host2tgt };
+  vd[0].ptr = &tgt_fn;
+  vd[0].size = sizeof (void *);
+  vd[1].ptr = &tgt_vars;
+  vd[1].size = sizeof (void *);
+
+  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd, NULL,
+          async_data);
+}
+
+extern "C" void
 GOMP_OFFLOAD_run (int device, void *tgt_fn, void *tgt_vars)
 {
-  TRACE ("(tgt_fn = %p, tgt_vars = %p)", tgt_fn, tgt_vars);
+  TRACE ("(device = %d, tgt_fn = %p, tgt_vars = %p)", device, tgt_fn, 
tgt_vars);
 
-  VarDesc vd1[2] = { vd_host2tgt, vd_host2tgt };
-  vd1[0].ptr = &tgt_fn;
-  vd1[0].size = sizeof (void *);
-  vd1[1].ptr = &tgt_vars;
-  vd1[1].size = sizeof (void *);
-  VarDesc2 vd1g[2] = { { "tgt_fn", 0 }, { "tgt_vars", 0 } };
+  GOMP_OFFLOAD_async_run (device, tgt_fn, tgt_vars, NULL);
+}
+
+extern "C" bool
+GOMP_OFFLOAD_async_is_completed (int device, const void *async_data)
+{
+  TRACE ("(device = %d, async_data = %p)", device, async_data);
+
+  bool res;
+  pthread_mutex_lock (&async_tasks_lock);
+  res = async_tasks->count (async_data) == 0;
+  pthread_mutex_unlock (&async_tasks_lock);
+  return res;
+}
+
+/* Called by liboffloadmic when asynchronous function is completed.  */
+
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *async_data)
+{
+  TRACE ("(async_data = %p)", async_data);
 
-  offload (__FILE__, __LINE__, device, "__offload_target_run", 2, vd1, vd1g);
+  pthread_mutex_lock (&async_tasks_lock);
+  async_tasks->erase (async_data);
+  pthread_mutex_unlock (&async_tasks_lock);
 }
diff --git a/liboffloadmic/runtime/offload_host.cpp 
b/liboffloadmic/runtime/offload_host.cpp
index 08f626f..8cee12c 100644
--- a/liboffloadmic/runtime/offload_host.cpp
+++ b/liboffloadmic/runtime/offload_host.cpp
@@ -64,6 +64,9 @@ static void __offload_fini_library(void);
 #define GET_OFFLOAD_NUMBER(timer_data) \
     timer_data? timer_data->offload_number : 0
 
+extern "C" void
+__gomp_offload_intelmic_async_completed (const void *);
+
 extern "C" {
 #ifdef TARGET_WINNT
 // Windows does not support imports from libraries without actually
@@ -2507,7 +2510,7 @@ extern "C" {
         const void *info
     )
     {
-       /* TODO: Call callback function, pass info.  */
+       __gomp_offload_intelmic_async_completed (info);
     }
 }
 

  -- Ilya

Reply via email to