On 12/30/2017 10:54 AM, Jakub Jelinek wrote:
On Fri, Dec 29, 2017 at 02:07:49PM +0100, Tom de Vries wrote:
--- a/gcc/lto-streamer-out.c
+++ b/gcc/lto-streamer-out.c
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "builtins.h"
#include "gomp-constants.h"
#include "debug.h"
+#include "omp-offload.h"
static void lto_write_tree (struct output_block*, tree, bool);
@@ -2355,6 +2356,31 @@ lto_output (void)
int i, n_nodes;
lto_symtab_encoder_t encoder = lto_get_out_decl_state
()->symtab_node_encoder;
+ bool truncated_p = false;
I don't think you need this var.
Removed.
+ unsigned int write_index = 0;
+ for (unsigned read_index = 0; read_index < vec_safe_length (offload_funcs);
+ read_index++)
+ {
+ tree fn_decl = (*offload_funcs)[read_index];
+ bool remove_p = cgraph_node::get (fn_decl) == NULL;
+ if (remove_p)
+ {
+ truncated_p = true;
+ continue;
+ }
+
+ if (write_index != read_index)
+ (*offload_funcs)[write_index] = (*offload_funcs)[read_index];
+
+ write_index++;
+ }
+ if (truncated_p)
+ offload_funcs->truncate (write_index);
Either you truncate unconditionally, truncate is extremely cheap operation,
or if you really wanted to guard it, you could just do
if (read_index != write_index)
My concern was not the cost, but offload_funcs == NULL.
I've fixed this now by moving the code into a separate function and
using the offload_funcs == NULL as early exit test.
+
+ if (!flag_lto)
+ for (unsigned i = 0; i < vec_safe_length (offload_funcs); i++)
+ DECL_PRESERVE_P ((*offload_funcs)[i]) = 1;
Can you please do this inside of the above loop, you have fn_decl already
there, just do it after the
if (remove_p)
continue;
Done.
And, I think you can do it unconditionally at that point,
Done. [ I wonder though if we can use in_lto_p as early exit test as well. ]
or, can you use
in_lto_p instead of flag_lto? flag_lto is set even during the -flto
compilation of the sources before LTO is streamed, there is no need to
pessimize that code, we can still remove it, we just can't remove anything
after we've streamed LTO bytecode (for either the host or offloading
targets).
@@ -7058,7 +7058,11 @@ expand_omp_target (struct omp_region *region)
/* Add the new function to the offload table. */
if (ENABLE_OFFLOADING)
- vec_safe_push (offload_funcs, child_fn);
+ {
+ if (flag_lto)
+ DECL_PRESERVE_P (child_fn) = 1;
And use if (in_lto_p) here too.
Done.
Ok for trunk with those changes.
Will commit after another round of testing.
Thanks,
- Tom
Prune removed funcs from offload table
2017-12-27 Tom de Vries <t...@codesourcery.com>
PR libgomp/83046
* omp-expand.c (expand_omp_target): If in_lto_p, mark offload_funcs with
DECL_PRESERVE_P.
* lto-streamer-out.c (prune_offload_funcs): New function. Remove
offload_funcs entries that no longer have a corresponding cgraph_node.
Mark the remaining ones as DECL_PRESERVE_P.
(output_lto): Call prune_offload_funcs.
* testsuite/libgomp.oacc-c-c++-common/pr83046.c: New test.
* testsuite/libgomp.c-c++-common/pr83046.c: New test.
---
gcc/lto-streamer-out.c | 32 ++++++++++++++++++++++
gcc/omp-expand.c | 6 +++-
libgomp/testsuite/libgomp.c-c++-common/pr83046.c | 25 +++++++++++++++++
.../testsuite/libgomp.oacc-c-c++-common/pr83046.c | 25 +++++++++++++++++
4 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/gcc/lto-streamer-out.c b/gcc/lto-streamer-out.c
index ba29bd088e6..ef170838fc0 100644
--- a/gcc/lto-streamer-out.c
+++ b/gcc/lto-streamer-out.c
@@ -41,6 +41,7 @@ along with GCC; see the file COPYING3. If not see
#include "builtins.h"
#include "gomp-constants.h"
#include "debug.h"
+#include "omp-offload.h"
static void lto_write_tree (struct output_block*, tree, bool);
@@ -2345,6 +2346,35 @@ wrap_refs (tree *tp, int *ws, void *)
return NULL_TREE;
}
+/* Remove functions that are no longer used from offload_funcs, and mark the
+ remaining ones with DECL_PRESERVE_P. */
+
+static void
+prune_offload_funcs (void)
+{
+ if (!offload_funcs)
+ return;
+
+ unsigned int write_index = 0;
+ for (unsigned read_index = 0; read_index < vec_safe_length (offload_funcs);
+ read_index++)
+ {
+ tree fn_decl = (*offload_funcs)[read_index];
+ bool remove_p = cgraph_node::get (fn_decl) == NULL;
+ if (remove_p)
+ continue;
+
+ DECL_PRESERVE_P (fn_decl) = 1;
+
+ if (write_index != read_index)
+ (*offload_funcs)[write_index] = (*offload_funcs)[read_index];
+
+ write_index++;
+ }
+
+ offload_funcs->truncate (write_index);
+}
+
/* Main entry point from the pass manager. */
void
@@ -2355,6 +2385,8 @@ lto_output (void)
int i, n_nodes;
lto_symtab_encoder_t encoder = lto_get_out_decl_state ()->symtab_node_encoder;
+ prune_offload_funcs ();
+
if (flag_checking)
output = lto_bitmap_alloc ();
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 02488339b40..663711b3aa4 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -7058,7 +7058,11 @@ expand_omp_target (struct omp_region *region)
/* Add the new function to the offload table. */
if (ENABLE_OFFLOADING)
- vec_safe_push (offload_funcs, child_fn);
+ {
+ if (in_lto_p)
+ DECL_PRESERVE_P (child_fn) = 1;
+ vec_safe_push (offload_funcs, child_fn);
+ }
bool need_asm = DECL_ASSEMBLER_NAME_SET_P (current_function_decl)
&& !DECL_ASSEMBLER_NAME_SET_P (child_fn);
diff --git a/libgomp/testsuite/libgomp.c-c++-common/pr83046.c b/libgomp/testsuite/libgomp.c-c++-common/pr83046.c
new file mode 100644
index 00000000000..90dcb704fb3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c-c++-common/pr83046.c
@@ -0,0 +1,25 @@
+/* { dg-do link } */
+
+#define N 100
+
+int
+main ()
+{
+ int a[N];
+ int i, x;
+ int c;
+
+ c = 1;
+#pragma omp target
+ for (i = 0; i < 100; i++)
+ a[i] = 0;
+
+ if (c)
+ __builtin_unreachable ();
+
+#pragma omp target
+ for (i = 0; i < 100; i++)
+ a[i] = 1;
+
+ return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83046.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83046.c
new file mode 100644
index 00000000000..a2a085c5fb2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/pr83046.c
@@ -0,0 +1,25 @@
+/* { dg-do link } */
+
+#define N 100
+
+int
+main ()
+{
+ int a[N];
+ int i, x;
+ int c;
+
+ c = 1;
+#pragma acc parallel loop
+ for (i = 0; i < 100; i++)
+ a[i] = 0;
+
+ if (c)
+ __builtin_unreachable ();
+
+#pragma acc parallel loop
+ for (i = 0; i < 100; i++)
+ a[i] = 1;
+
+ return 0;
+}