Hi Kwok!

On 2020-07-13T16:29:14+0100, Kwok Cheung Yeung <k...@codesourcery.com> wrote:
> When the version of nvprof in CUDA 9.0 is run on an OpenACC program, [...] the
> program deadlocks.

> I have added a testcase that sets up the situation presented by nvprof.

Thanks.  I have extended this one a little bit, to add some state
tracking to verify that we get the expected callbacks invoked, test what
we expect returned from 'acc_get_device_type', and in addition to your
'acc_ev_device_init_start' also verify the corresponding
'acc_ev_device_init_end'.

I've also updated the documentation.

> This
> testcase hangs without the patch (hence the short dg-timeout), and passes with
> it.

(Thus, 'dg-timeout' not really necessary anymore, but OK to leave in if
you'd like.)

> Okay for master, GCC 10 branch and OG10?

Thanks, OK, with the incremental patch merged in, unless there's anything
to discuss further.

>     libgomp: Fix hang when profiling OpenACC programs with CUDA 9.0 nvprof
>
>     The version of nvprof in CUDA 9.0 causes a hang when used to profile an
>     OpenACC program.  This is because it calls acc_get_device_type from
>     a callback called during device initialization, which then attempts
>     to acquire acc_device_lock while it is already taken, resulting in
>     deadlock.  This works around the issue by returning acc_device_none
>     from acc_get_device_type without attempting to acquire the lock when
>     initialization has not completed yet.
>
>     2020-07-13  Tom de Vries  <tdevr...@suse.de>

Should use Tom's CodeSourcery address, given that's when this work was
done.

>           Cesar Philippidis  <ce...@codesourcery.com>
>           Thomas Schwinge  <tho...@codesourcery.com>
>           Kwok Cheung Yeung  <k...@codesourcery.com>


Grüße
 Thomas


>       libgomp/
>       * oacc-init.c (acc_init_state_lock, acc_init_state, acc_init_thread):
>       New variable.
>       (acc_init_1): Set acc_init_thread to pthread_self ().  Set
>       acc_init_state to initializing at the start, and to initialized at the
>       end.
>       (self_initializing_p): New function.
>       (acc_get_device_type): Return acc_device_none if called by thread that
>       is currently executing acc_init_1.
>       * testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c: New.
>
> diff --git a/libgomp/oacc-init.c b/libgomp/oacc-init.c
> index 5d786a5..1e7f934 100644
> --- a/libgomp/oacc-init.c
> +++ b/libgomp/oacc-init.c
> @@ -40,6 +40,11 @@
>
>  static gomp_mutex_t acc_device_lock;
>
> +static gomp_mutex_t acc_init_state_lock;
> +static enum { uninitialized, initializing, initialized } acc_init_state
> +  = uninitialized;
> +static pthread_t acc_init_thread;
> +
>  /* A cached version of the dispatcher for the global "current" accelerator 
> type,
>     e.g. used as the default when creating new host threads.  This is the
>     device-type equivalent of goacc_device_num (which specifies which device 
> to
> @@ -228,6 +233,11 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int 
> ndevs)
>  static struct gomp_device_descr *
>  acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
>  {
> +  gomp_mutex_lock (&acc_init_state_lock);
> +  acc_init_state = initializing;
> +  acc_init_thread = pthread_self ();
> +  gomp_mutex_unlock (&acc_init_state_lock);
> +
>    bool check_not_nested_p;
>    if (implicit)
>      {
> @@ -317,6 +327,14 @@ acc_init_1 (acc_device_t d, acc_construct_t 
> parent_construct, int implicit)
>                               &api_info);
>      }
>
> +  /* We're setting 'initialized' *after* 'goacc_profiling_dispatch', so that 
> a
> +     nested 'acc_get_device_type' called from a profiling callback still sees
> +     'initializing', so that we don't deadlock when it then again tries to 
> lock
> +     'goacc_prof_lock'.  See also the discussion in 'acc_get_device_type'.  
> */
> +  gomp_mutex_lock (&acc_init_state_lock);
> +  acc_init_state = initialized;
> +  gomp_mutex_unlock (&acc_init_state_lock);
> +
>    return base_dev;
>  }
>
> @@ -643,6 +661,17 @@ acc_set_device_type (acc_device_t d)
>
>  ialias (acc_set_device_type)
>
> +static bool
> +self_initializing_p (void)
> +{
> +  bool res;
> +  gomp_mutex_lock (&acc_init_state_lock);
> +  res = (acc_init_state == initializing
> +      && pthread_equal (acc_init_thread, pthread_self ()));
> +  gomp_mutex_unlock (&acc_init_state_lock);
> +  return res;
> +}
> +
>  acc_device_t
>  acc_get_device_type (void)
>  {
> @@ -652,6 +681,15 @@ acc_get_device_type (void)
>
>    if (thr && thr->base_dev)
>      res = acc_device_type (thr->base_dev->type);
> +  else if (self_initializing_p ())
> +    /* The Cuda libaccinj64.so version 9.0+ calls acc_get_device_type during 
> the
> +       acc_ev_device_init_start event callback, which is dispatched during
> +       acc_init_1.  Trying to lock acc_device_lock during such a call (as we 
> do
> +       in the else clause below), will result in deadlock, since the lock has
> +       already been taken by the acc_init_1 caller.  We work around this 
> problem
> +       by using the acc_get_device_type property "If the device type has not 
> yet
> +       been selected, the value acc_device_none may be returned".  */
> +    ;
>    else
>      {
>        acc_prof_info prof_info;
> diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c 
> b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
> new file mode 100644
> index 0000000..6114164
> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
> @@ -0,0 +1,39 @@
> +/* { dg-do run } */
> +/* { dg-timeout 10 } */
> +
> +/* Test the calling of acc_get_device_type() from within the 
> device_init_start
> +   callback.  This occurs when the CUDA 9.0 nvprof tool is used, and can
> +   deadlock if not handled properly.  */
> +
> +#include <acc_prof.h>
> +
> +static acc_prof_reg reg;
> +static acc_prof_reg unreg;
> +static acc_prof_lookup_func lookup;
> +
> +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
> acc_prof_lookup_func lookup_)
> +{
> +  reg = reg_;
> +  unreg = unreg_;
> +  lookup = lookup_;
> +}
> +
> +static acc_device_t acc_device_type;
> +
> +static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info 
> *event_info, acc_api_info *api_info)
> +{
> +  acc_device_type = acc_get_device_type ();
> +}
> +
> +int main(void)
> +{
> +  acc_register_library (acc_prof_register, acc_prof_unregister, 
> acc_prof_lookup);
> +
> +  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
> +
> +  acc_init (acc_device_host);
> +  acc_shutdown (acc_device_host);
> +
> +  acc_init (acc_device_default);
> +  acc_shutdown (acc_device_default);
> +}


-----------------
Mentor Graphics (Deutschland) GmbH, Arnulfstraße 201, 80634 München / Germany
Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Alexander 
Walter
>From 82e43a3263068e006dc96f9bf0ace033e45038ef Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <tho...@codesourcery.com>
Date: Tue, 14 Jul 2020 12:43:53 +0200
Subject: [PATCH] into "libgomp: Fix hang when profiling OpenACC programs with
 CUDA 9.0 nvprof"

---
 libgomp/libgomp.texi                          | 11 +++
 .../acc_prof-cb-call.c                        | 39 ---------
 .../acc_prof-init-2.c                         | 80 +++++++++++++++++++
 3 files changed, 91 insertions(+), 39 deletions(-)
 delete mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c

diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi
index b946743f9b1..5331230c207 100644
--- a/libgomp/libgomp.texi
+++ b/libgomp/libgomp.texi
@@ -1967,6 +1967,12 @@ in @var{devicetype}, to use when executing a parallel or kernels region.
 This function returns what device type will be used when executing a
 parallel or kernels region.
 
+This function returns @code{acc_device_none} if
+@code{acc_get_device_type} is called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks of the OpenACC Profiling Interface (@ref{OpenACC Profiling
+Interface}), that is, if the device is currently being initialized.
+
 @item @emph{C/C++}:
 @multitable @columnfractions .20 .80
 @item @emph{Prototype}: @tab @code{acc_device_t acc_get_device_type(void);}
@@ -3382,6 +3388,11 @@ every event that has been registered.
 
 We're not yet accounting for the fact that @cite{OpenACC events may
 occur during event processing}.
+We just handle one case specially, as required by CUDA 9.0
+@command{nvprof}, that @code{acc_get_device_type}
+(@ref{acc_get_device_type})) may be called from
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+callbacks.
 
 We're not yet implementing initialization via a
 @code{acc_register_library} function that is either statically linked
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
deleted file mode 100644
index 6114164aa24..00000000000
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-cb-call.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/* { dg-do run } */
-/* { dg-timeout 10 } */
-
-/* Test the calling of acc_get_device_type() from within the device_init_start
-   callback.  This occurs when the CUDA 9.0 nvprof tool is used, and can
-   deadlock if not handled properly.  */
-
-#include <acc_prof.h>
-
-static acc_prof_reg reg;
-static acc_prof_reg unreg;
-static acc_prof_lookup_func lookup;
-
-void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
-{
-  reg = reg_;
-  unreg = unreg_;
-  lookup = lookup_;
-}
-
-static acc_device_t acc_device_type;
-
-static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
-{
-  acc_device_type = acc_get_device_type ();
-}
-
-int main(void)
-{
-  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
-
-  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
-
-  acc_init (acc_device_host);
-  acc_shutdown (acc_device_host);
-
-  acc_init (acc_device_default);
-  acc_shutdown (acc_device_default);
-}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c
new file mode 100644
index 00000000000..fab595cd463
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-2.c
@@ -0,0 +1,80 @@
+/* { dg-do run } */
+/* { dg-timeout 10 } */
+
+/* Test the calling of 'acc_get_device_type' from within
+   'cb_device_init_start', 'cb_device_init_end' callbacks.  This occurs when
+   the CUDA 9.0 'nvprof' tool is used, and did deadlock.  */
+
+#include <assert.h>
+#include <stdbool.h>
+#include <acc_prof.h>
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_)
+{
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+static bool expect_cb_device_init_start;
+static bool expect_cb_device_init_end;
+
+static void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  assert (expect_cb_device_init_start);
+  expect_cb_device_init_start = false;
+
+  acc_device_t acc_device_type;
+  acc_device_type = acc_get_device_type ();
+  assert (acc_device_type == acc_device_none);
+
+  expect_cb_device_init_end = true;
+}
+
+static void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info)
+{
+  assert (expect_cb_device_init_end);
+  expect_cb_device_init_end = false;
+
+  acc_device_t acc_device_type;
+  acc_device_type = acc_get_device_type ();
+  assert (acc_device_type == acc_device_none);
+}
+
+int main(void)
+{
+  acc_register_library (acc_prof_register, acc_prof_unregister, acc_prof_lookup);
+
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+
+  expect_cb_device_init_start = true;
+  expect_cb_device_init_end = false;
+  acc_init (acc_device_host);
+  assert (!expect_cb_device_init_start);
+  assert (!expect_cb_device_init_end);
+  {
+    acc_device_t acc_device_type;
+    acc_device_type = acc_get_device_type ();
+    assert (acc_device_type == acc_device_host);
+  }
+  acc_shutdown (acc_device_host);
+
+  expect_cb_device_init_start = true;
+  expect_cb_device_init_end = false;
+  acc_init (acc_device_default);
+  assert (!expect_cb_device_init_start);
+  assert (!expect_cb_device_init_end);
+  {
+    acc_device_t acc_device_type;
+    acc_device_type = acc_get_device_type ();
+    assert (acc_device_type != acc_device_none);
+  }
+  acc_shutdown (acc_device_default);
+
+  return 0;
+}
-- 
2.17.1

Reply via email to