Hi!

On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <nat...@acm.org> wrote:
> acc_on_device and it's builtin had a conflict.  The function formally takes 
> an 
> enum argument, but the builtin takes an int -- primarily to avoid the 
> compiler 
> having to generate the enum  type internally.
> 
> This works fine for C,  where the external declaration of the function (in 
> openacc.h) matches up with the builtin, and we optimize the builtin as 
> expected.
> 
> It fails for C++ where the builtin doesn't match the declaration in the 
> header. 
>   We end up with emitting a call to acc_on_device,  which is resolved by 
> libgomp.  Unfortunately that means we fail to optimize.  [...]

> [Nathan's trunk r229562] leaves things unchanged for C --  declare a function 
> with an enum arg. 
>   But for C++ we the extern "C" declaration takes an int -- and therefore 
> matches the builtin.  We insert an inline wrapper that takes an enum 
> argument. 
> Because of C++'s overload resolution both the wrapper and the int-taking 
> declaration can have the same source name.

> --- libgomp/openacc.h (revision 229535)
> +++ libgomp/openacc.h (working copy)

> -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> +#ifdef __cplusplus
> +int acc_on_device (int __arg) __GOACC_NOTHROW;
> +#else
> +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> +#endif

>  #ifdef __cplusplus
>  }
> +
> +/* Forwarding function with correctly typed arg.  */
> +
> +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> +{
> +  return acc_on_device ((int) __arg);
> +}
>  #endif

> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c       
> (revision 0)
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c       
> (working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include <openacc.h>
> +
> +int Foo (acc_device_t x)
> +{
> +  return acc_on_device (x);
> +}
> +
> +/* { dg-final { scan-assembler-not "acc_on_device" } } */

As a user, I'd expect that when compiling such code with "-O0" instead of
"-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
then get "acc_on_device" expanded as a builtin, and no calls to the
"acc_on_device library function.  In C++ that is currently not working,
because the "Forwarding function with correctly typed arg" (cited above)
doesn't "inherit" that "optimize" attribute.  Making that one "always
inline" resolves the problem.  Also I cleaned up and extended testing
some more.  OK for trunk?

commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
Author: Thomas Schwinge <tho...@codesourcery.com>
Date:   Tue May 23 13:21:14 2017 +0200

    Make the OpenACC C++ acc_on_device wrapper "always inline"
    
            libgomp/
            * openacc.h [__cplusplus] (acc_on_device): Mark as "always
            inline".
            * testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
            file; test cases already present...
            * testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
            this file.  Update.
            * testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
            file; test cases now present...
            * testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
            this new file.
            * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
---
 libgomp/openacc.h                                  |  3 +-
 .../libgomp.oacc-c-c++-common/acc-on-device-2.c    | 22 -------------
 .../libgomp.oacc-c-c++-common/acc-on-device.c      | 12 -------
 .../libgomp.oacc-c-c++-common/acc_on_device-1.c    | 38 +++++++++++++---------
 .../libgomp.oacc-c-c++-common/acc_on_device-2.c    | 21 ++++++++++++
 .../libgomp.oacc-c-c++-common/parallel-dims.c      | 14 ++++----
 6 files changed, 52 insertions(+), 58 deletions(-)

diff --git libgomp/openacc.h libgomp/openacc.h
index 137e2c1..266f559 100644
--- libgomp/openacc.h
+++ libgomp/openacc.h
@@ -121,7 +121,8 @@ int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
 /* Forwarding function with correctly typed arg.  */
 
 #pragma acc routine seq
-inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
+inline __attribute__ ((__always_inline__)) int
+acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
 {
   return acc_on_device ((int) __arg);
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
deleted file mode 100644
index bfcb67d..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Test the acc_on_device library function. */
-/* { dg-additional-options "-fno-builtin-acc_on_device" } */
-
-#include <openacc.h>
-
-int main ()
-{
-  int dev;
-  
-#pragma acc parallel copyout (dev)
-  {
-    dev = acc_on_device (acc_device_not_host);
-  }
-
-  int expect = 1;
-  
-#if  ACC_DEVICE_TYPE_host
-  expect = 0;
-#endif
-  
-  return dev != expect;
-}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
deleted file mode 100644
index e0d8710..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* { dg-do compile } */
-/* We don't expect this to work with optimizations disabled.
-   { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
-#include <openacc.h>
-
-int Foo (acc_device_t x)
-{
-  return acc_on_device (x);
-}
-
-/* { dg-final { scan-assembler-not "acc_on_device" } } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
index 8112745..eb962e4 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
@@ -1,6 +1,9 @@
 /* Disable the acc_on_device builtin; we want to test the libgomp library
    function.  */
+/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
 /* { dg-additional-options "-fno-builtin-acc_on_device" } */
+/* { dg-additional-options "-fdump-rtl-expand" }
+   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12 
"expand" } } */
 
 #include <stdlib.h>
 #include <openacc.h>
@@ -11,13 +14,13 @@ main (int argc, char *argv[])
   /* Host.  */
 
   {
-    if (!acc_on_device (acc_device_none))
+    if (!ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (!acc_on_device (acc_device_host))
+    if (!ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (acc_on_device (acc_device_not_host))
+    if (ACC_ON_DEVICE (acc_device_not_host))
       abort ();
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
   }
 
@@ -26,39 +29,44 @@ main (int argc, char *argv[])
 
 #pragma acc parallel if(0)
   {
-    if (!acc_on_device (acc_device_none))
+    if (!ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (!acc_on_device (acc_device_host))
+    if (!ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (acc_on_device (acc_device_not_host))
+    if (ACC_ON_DEVICE (acc_device_not_host))
       abort ();
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
   }
 
 
-#if !ACC_DEVICE_TYPE_host
+  int on_host_p;
+#if ACC_DEVICE_TYPE_host
+  on_host_p = 1;
+#else
+  on_host_p = 0;
+#endif
 
   /* Offloaded.  */
 
 #pragma acc parallel
   {
-    if (acc_on_device (acc_device_none))
+    if (on_host_p != ACC_ON_DEVICE (acc_device_none))
       abort ();
-    if (acc_on_device (acc_device_host))
+    if (on_host_p != ACC_ON_DEVICE (acc_device_host))
       abort ();
-    if (!acc_on_device (acc_device_not_host))
+    if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
       abort ();
+
 #if ACC_DEVICE_TYPE_nvidia
-    if (!acc_on_device (acc_device_nvidia))
+    if (!ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
 #else
-    if (acc_on_device (acc_device_nvidia))
+    if (ACC_ON_DEVICE (acc_device_nvidia))
       abort ();
 #endif
   }
 
-#endif
 
   return 0;
 }
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
new file mode 100644
index 0000000..c3b3378
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
@@ -0,0 +1,21 @@
+/* With the acc_on_device builtin enabled, we don't expect any calls to the
+   libgomp library function.  */
+/* { dg-additional-options "-fdump-rtl-expand" }
+   { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0 
"expand" } } */
+
+#include <openacc.h>
+
+#ifdef __OPTIMIZE__
+# define ACC_ON_DEVICE acc_on_device
+#else
+/* Without optimizations enabled, we're not expecting the acc_on_device builtin
+   to be used, so use here a "-O2" wrapper.  */
+#pragma acc routine seq
+static int __attribute__ ((optimize ("O2")))
+ACC_ON_DEVICE (acc_device_t arg)
+{
+  return acc_on_device (arg);
+}
+#endif
+
+#include "acc_on_device-1.c"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 8308f7c..1c48ab3 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -4,14 +4,12 @@
 #include <limits.h>
 #include <openacc.h>
 
-/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
-   not behaving as expected for -O0.  */
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
@@ -24,9 +22,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) 
acc_gang ()
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
@@ -39,9 +37,9 @@ static unsigned int __attribute__ ((optimize ("O2"))) 
acc_worker ()
 #pragma acc routine seq
 static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
 {
-  if (acc_on_device ((int) acc_device_host))
+  if (acc_on_device (acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device (acc_device_nvidia))
     {
       unsigned int r;
       asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));


Grüße
 Thomas

Reply via email to