On 09/25/15 06:28, Bernd Schmidt wrote:
This is the c-c++-common/goacc/acc_on_device-2.c testcase. Is that expected to
be handled? If I change it to use __builtin_acc_on_device, I can step right into
Breakpoint 8, fold_call_stmt (stmt=0x7ffff0736e10, ignore=false) at
../../git/gcc/builtins.c:12277
12277 tree ret = NULL_TREE;
Maybe you were compiling without optimization? In that case
expand_builtin_acc_on_device (which already exists) should still end up doing
the right thing. In no case should you see a RTL call to a function, that
indicates that something else went wrong.
I think I was reading more into the std than it intended, as it claims
on_device should evaluate 'to a constant' (no mention of 'when optimizing').
It can't mean 'be usable in an integral-constant-expression', as at the point we
need those, one doesn't know the value it should be.
Thinking about it, I don't think a user can tell. The case I had in mind (and
have used it for), is something like
on_device (nvidia) ? asm ("NVIDIA specific asm") : c-expr
and for that to work, one must turn the optimizer on to get the dead code
removal, regardless of where on_device expands. So my goal of getting it
expanded regardless of optimization level is not needed --- indeed getting it
expanded in fold_call_stmt will mean the body of expand_on_device can go away (I
think).
From the programmer's POV, what really matters is that, when optimizing,
the compiler knows how to fold it.
Can you send me the patch you tried (and possibly a testcase you expect to be
handled), I'll see if I can find out what's going on.
Thanks! When things didn't work, I tried getting it working on the gomp4
branch, as I knew what to expect there. So the patch is for that branch.
The fails I observed are:
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/if-1.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/gang-static-2.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O0
execution test
FAIL: libgomp.oacc-c/../libgomp.oacc-c-c++-common/gang-static-2.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2
execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/if-1.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/gang-static-2.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O0
execution test
FAIL: libgomp.oacc-c++/../libgomp.oacc-c-c++-common/gang-static-2.c
-DACC_DEVICE_TYPE_nvidia=1 -DACC_MEM_SHARED=0 -foffload=nvptx-none -O2
execution test
the diff I have is attached -- as you can see it's 'experimental'.
nathan
Index: builtins.c
===================================================================
--- builtins.c (revision 228094)
+++ builtins.c (working copy)
@@ -5866,6 +5866,8 @@ expand_stack_save (void)
static rtx
expand_builtin_acc_on_device (tree exp, rtx target)
{
+ gcc_unreachable ();
+
#ifndef ACCEL_COMPILER
gcc_assert (!get_oacc_fn_attrib (current_function_decl));
#endif
@@ -10272,6 +10274,27 @@ fold_builtin_1 (location_t loc, tree fnd
return build_empty_stmt (loc);
break;
+ case BUILT_IN_ACC_ON_DEVICE:
+ /* Don't fold on_device until we know which compiler is active. */
+ if (symtab->state == EXPANSION)
+ {
+ unsigned val_host = GOMP_DEVICE_HOST;
+ unsigned val_dev = GOMP_DEVICE_NONE;
+
+#ifdef ACCEL_COMPILER
+ val_host = GOMP_DEVICE_NOT_HOST;
+ val_dev = ACCEL_COMPILER_acc_device;
+#endif
+ tree host = build2 (EQ_EXPR, boolean_type_node, arg0,
+ build_int_cst (integer_type_node, val_host));
+ tree dev = build2 (EQ_EXPR, boolean_type_node, arg0,
+ build_int_cst (integer_type_node, val_dev));
+
+ tree result = build2 (TRUTH_OR_EXPR, boolean_type_node, host, dev);
+ return fold_convert (integer_type_node, result);
+ }
+ break;
+
default:
break;
}
Index: omp-low.c
===================================================================
--- omp-low.c (revision 228094)
+++ omp-low.c (working copy)
@@ -14725,21 +14725,20 @@ static void
oacc_xform_on_device (gcall *call)
{
tree arg = gimple_call_arg (call, 0);
- unsigned val = GOMP_DEVICE_HOST;
-
-#ifdef ACCEL_COMPILER
- val = GOMP_DEVICE_NOT_HOST;
-#endif
- tree result = build2 (EQ_EXPR, boolean_type_node, arg,
- build_int_cst (integer_type_node, val));
+ unsigned val_host = GOMP_DEVICE_HOST;
+ unsigned val_dev = GOMP_DEVICE_NONE;
+
#ifdef ACCEL_COMPILER
- {
- tree dev = build2 (EQ_EXPR, boolean_type_node, arg,
- build_int_cst (integer_type_node,
- ACCEL_COMPILER_acc_device));
- result = build2 (TRUTH_OR_EXPR, boolean_type_node, result, dev);
- }
+ val_host = GOMP_DEVICE_NOT_HOST;
+ val_dev = ACCEL_COMPILER_acc_device;
#endif
+
+ tree host = build2 (EQ_EXPR, boolean_type_node, arg,
+ build_int_cst (integer_type_node, val_host));
+ tree dev = build2 (EQ_EXPR, boolean_type_node, arg,
+ build_int_cst (integer_type_node, val_dev));
+
+ tree result = build2 (TRUTH_OR_EXPR, boolean_type_node, host, dev);
result = fold_convert (integer_type_node, result);
tree lhs = gimple_call_lhs (call);
gimple_seq seq = NULL;
@@ -14879,7 +14878,7 @@ execute_oacc_transform ()
gcall *call = as_a <gcall *> (stmt);
- if (gimple_call_builtin_p (call, BUILT_IN_ACC_ON_DEVICE))
+ if (0 && gimple_call_builtin_p (call, BUILT_IN_ACC_ON_DEVICE))
/* acc_on_device must be evaluated at compile time for
constant arguments. */
{