This patch contributes the GCN libgomp plugin, with the various
configure and make bits to go with it.
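The plugin is enabled through the usual offloading configury, e.g.
(illustrative) by configuring the host toolchain with
--enable-offload-targets=amdgcn-amdhsa.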

This implementation is a much-cleaned-up version of the one present on the
openacc-gcc-9-branch.

OK to commit?

Thanks

Andrew


2019-11-12  Andrew Stubbs  <a...@codesourcery.com>

        libgomp/
        * plugin/Makefrag.am: Add amdgcn plugin support.
        * plugin/configfrag.ac: Likewise.
        * plugin/plugin-gcn.c: New file.
        * configure: Regenerate.
---
 libgomp/plugin/Makefrag.am   |   14 +
 libgomp/plugin/configfrag.ac |   35 +
 libgomp/plugin/plugin-gcn.c  | 3985 ++++++++++++++++++++++++++++++++++
 3 files changed, 4034 insertions(+)
 create mode 100644 libgomp/plugin/plugin-gcn.c

diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am
index 168ef59de41..45ed043e333 100644
--- a/libgomp/plugin/Makefrag.am
+++ b/libgomp/plugin/Makefrag.am
@@ -52,3 +52,17 @@ libgomp_plugin_hsa_la_LDFLAGS += $(PLUGIN_HSA_LDFLAGS)
 libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
 endif
+
+if PLUGIN_GCN
+# AMD GCN plugin
+libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION)
+toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la
+libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c
+libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) $(PLUGIN_GCN_CPPFLAGS) \
+	-D_GNU_SOURCE
+libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \
+	$(lt_host_flags)
+libgomp_plugin_gcn_la_LDFLAGS += $(PLUGIN_GCN_LDFLAGS)
+libgomp_plugin_gcn_la_LIBADD = libgomp.la $(PLUGIN_GCN_LIBS)
+libgomp_plugin_gcn_la_LIBTOOLFLAGS = --tag=disable-static
+endif
diff --git a/libgomp/plugin/configfrag.ac b/libgomp/plugin/configfrag.ac
index 9718ac752e2..424ec6c96b2 100644
--- a/libgomp/plugin/configfrag.ac
+++ b/libgomp/plugin/configfrag.ac
@@ -137,6 +137,15 @@ AC_SUBST(PLUGIN_HSA_CPPFLAGS)
 AC_SUBST(PLUGIN_HSA_LDFLAGS)
 AC_SUBST(PLUGIN_HSA_LIBS)
 
+PLUGIN_GCN=0
+PLUGIN_GCN_CPPFLAGS=
+PLUGIN_GCN_LDFLAGS=
+PLUGIN_GCN_LIBS=
+AC_SUBST(PLUGIN_GCN)
+AC_SUBST(PLUGIN_GCN_CPPFLAGS)
+AC_SUBST(PLUGIN_GCN_LDFLAGS)
+AC_SUBST(PLUGIN_GCN_LIBS)
+
 # Parse '--enable-offload-targets', figure out the corresponding libgomp
 # plugins, and configure to find the corresponding offload compilers.
 # 'offload_plugins' and 'offload_targets' will be populated in the same order.
@@ -237,6 +246,29 @@ if test x"$enable_offload_targets" != x; then
             ;;
         esac
         ;;
+
+      amdgcn*)
+	case "${target}" in
+	  x86_64-*-*)
+	    case " ${CC} ${CFLAGS} " in
+	      *" -m32 "*)
+		PLUGIN_GCN=0
+		;;
+	      *)
+		tgt_plugin=gcn
+		PLUGIN_GCN=1
+		PLUGIN_GCN_CPPFLAGS=$HSA_RUNTIME_CPPFLAGS
+		PLUGIN_GCN_LDFLAGS="$HSA_RUNTIME_LDFLAGS"
+		PLUGIN_GCN_LIBS="-ldl"
+		;;
+	    esac
+	    ;;
+	  *-*-*)
+	    PLUGIN_GCN=0
+	    ;;
+	esac
+	;;
       *)
 	AC_MSG_ERROR([unknown offload target specified])
 	;;
@@ -275,6 +307,9 @@ AC_DEFINE_UNQUOTED([PLUGIN_NVPTX_DYNAMIC], [$PLUGIN_NVPTX_DYNAMIC],
 AM_CONDITIONAL([PLUGIN_HSA], [test $PLUGIN_HSA = 1])
 AC_DEFINE_UNQUOTED([PLUGIN_HSA], [$PLUGIN_HSA],
   [Define to 1 if the HSA plugin is built, 0 if not.])
+AM_CONDITIONAL([PLUGIN_GCN], [test $PLUGIN_GCN = 1])
+AC_DEFINE_UNQUOTED([PLUGIN_GCN], [$PLUGIN_GCN],
+  [Define to 1 if the GCN plugin is built, 0 if not.])
 
 if test "$HSA_RUNTIME_LIB" != ""; then
   HSA_RUNTIME_LIB="$HSA_RUNTIME_LIB/"
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
new file mode 100644
index 00000000000..583916759a5
--- /dev/null
+++ b/libgomp/plugin/plugin-gcn.c
@@ -0,0 +1,3985 @@
+/* Plugin for AMD GCN execution.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* {{{ Includes and defines  */
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <hsa.h>
+#include <dlfcn.h>
+#include <signal.h>
+#include "libgomp-plugin.h"
+#include "gomp-constants.h"
+#include <elf.h>
+#include "oacc-plugin.h"
+#include "oacc-int.h"
+#include <assert.h>
+
+/* Additional definitions not in HSA 1.1.
+   FIXME: this needs to be updated in hsa.h for upstream, but the only source
+          right now is the ROCr source which may cause license issues.  */
+#define HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT 0xA002
+
+/* These probably won't be in elf.h for a while.  */
+#define R_AMDGPU_NONE		0
+#define R_AMDGPU_ABS32_LO	1	/* (S + A) & 0xFFFFFFFF  */
+#define R_AMDGPU_ABS32_HI	2	/* (S + A) >> 32  */
+#define R_AMDGPU_ABS64		3	/* S + A  */
+#define R_AMDGPU_REL32		4	/* S + A - P  */
+#define R_AMDGPU_REL64		5	/* S + A - P  */
+#define R_AMDGPU_ABS32		6	/* S + A  */
+#define R_AMDGPU_GOTPCREL	7	/* G + GOT + A - P  */
+#define R_AMDGPU_GOTPCREL32_LO	8	/* (G + GOT + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_GOTPCREL32_HI	9	/* (G + GOT + A - P) >> 32  */
+#define R_AMDGPU_REL32_LO	10	/* (S + A - P) & 0xFFFFFFFF  */
+#define R_AMDGPU_REL32_HI	11	/* (S + A - P) >> 32  */
+/* Relocation 12 is reserved.  */
+#define R_AMDGPU_RELATIVE64	13	/* B + A  */
+
+/* GCN specific definitions for asynchronous queues.  */
+
+#define ASYNC_QUEUE_SIZE 64
+#define DRAIN_QUEUE_SYNCHRONOUS_P false
+#define DEBUG_QUEUES 0
+#define DEBUG_THREAD_SLEEP 0
+#define DEBUG_THREAD_SIGNAL 0
+
+/* Defaults.  */
+#define DEFAULT_GCN_HEAP_SIZE (100*1024*1024)  /* 100MB.  */
+
+/* Secure getenv() which returns NULL if running as SUID/SGID.  */
+#ifndef HAVE_SECURE_GETENV
+#ifdef HAVE___SECURE_GETENV
+#define secure_getenv __secure_getenv
+#elif defined (HAVE_UNISTD_H) && defined(HAVE_GETUID) && defined(HAVE_GETEUID) \
+  && defined(HAVE_GETGID) && defined(HAVE_GETEGID)
+
+#include <unistd.h>
+
+/* Implementation of secure_getenv() for targets where it is not provided but
+   we have at least the means to test real and effective IDs.  */
+
+static char *
+secure_getenv (const char *name)
+{
+  if ((getuid () == geteuid ()) && (getgid () == getegid ()))
+    return getenv (name);
+  else
+    return NULL;
+}
+
+#else
+#define secure_getenv getenv
+#endif
+#endif
+
+/* }}}  */
+/* {{{ Types  */
+
+/* GCN-specific implementation of the GOMP_PLUGIN_acc_thread data.  */
+
+struct gcn_thread
+{
+  /* The thread number from the async clause, or GOMP_ASYNC_SYNC.  */
+  int async;
+};
+
+/* As the HSA runtime is dlopened, the following structure defines the
+   function pointers we will use from it.  */
+
+struct hsa_runtime_fn_info
+{
+  /* HSA runtime.  */
+  hsa_status_t (*hsa_status_string_fn) (hsa_status_t status,
+					const char **status_string);
+  hsa_status_t (*hsa_system_get_info_fn) (hsa_system_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_agent_get_info_fn) (hsa_agent_t agent,
+					 hsa_agent_info_t attribute,
+					 void *value);
+  hsa_status_t (*hsa_isa_get_info_fn)(hsa_isa_t isa,
+				      hsa_isa_info_t attribute,
+				      uint32_t index,
+				      void *value);
+  hsa_status_t (*hsa_init_fn) (void);
+  hsa_status_t (*hsa_iterate_agents_fn)
+    (hsa_status_t (*callback)(hsa_agent_t agent, void *data), void *data);
+  hsa_status_t (*hsa_region_get_info_fn) (hsa_region_t region,
+					  hsa_region_info_t attribute,
+					  void *value);
+  hsa_status_t (*hsa_queue_create_fn)
+    (hsa_agent_t agent, uint32_t size, hsa_queue_type_t type,
+     void (*callback)(hsa_status_t status, hsa_queue_t *source, void *data),
+     void *data, uint32_t private_segment_size,
+     uint32_t group_segment_size, hsa_queue_t **queue);
+  hsa_status_t (*hsa_agent_iterate_regions_fn)
+    (hsa_agent_t agent,
+     hsa_status_t (*callback)(hsa_region_t region, void *data), void *data);
+  hsa_status_t (*hsa_executable_destroy_fn) (hsa_executable_t executable);
+  hsa_status_t (*hsa_executable_create_fn)
+    (hsa_profile_t profile, hsa_executable_state_t executable_state,
+     const char *options, hsa_executable_t *executable);
+  hsa_status_t (*hsa_executable_global_variable_define_fn)
+    (hsa_executable_t executable, const char *variable_name, void *address);
+  hsa_status_t (*hsa_executable_load_code_object_fn)
+    (hsa_executable_t executable, hsa_agent_t agent,
+     hsa_code_object_t code_object, const char *options);
+  hsa_status_t (*hsa_executable_freeze_fn)(hsa_executable_t executable,
+					   const char *options);
+  hsa_status_t (*hsa_signal_create_fn) (hsa_signal_value_t initial_value,
+					uint32_t num_consumers,
+					const hsa_agent_t *consumers,
+					hsa_signal_t *signal);
+  hsa_status_t (*hsa_memory_allocate_fn) (hsa_region_t region, size_t size,
+					  void **ptr);
+  hsa_status_t (*hsa_memory_assign_agent_fn) (void *ptr, hsa_agent_t agent,
+					      hsa_access_permission_t access);
+  hsa_status_t (*hsa_memory_copy_fn)(void *dst, const void *src, size_t size);
+  hsa_status_t (*hsa_memory_free_fn) (void *ptr);
+  hsa_status_t (*hsa_signal_destroy_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_executable_get_symbol_fn)
+    (hsa_executable_t executable, const char *module_name,
+     const char *symbol_name, hsa_agent_t agent, int32_t call_convention,
+     hsa_executable_symbol_t *symbol);
+  hsa_status_t (*hsa_executable_symbol_get_info_fn)
+    (hsa_executable_symbol_t executable_symbol,
+     hsa_executable_symbol_info_t attribute, void *value);
+  hsa_status_t (*hsa_executable_iterate_symbols_fn)
+    (hsa_executable_t executable,
+     hsa_status_t (*callback)(hsa_executable_t executable,
+			      hsa_executable_symbol_t symbol, void *data),
+     void *data);
+  uint64_t (*hsa_queue_add_write_index_release_fn) (const hsa_queue_t *queue,
+						    uint64_t value);
+  uint64_t (*hsa_queue_load_read_index_acquire_fn) (const hsa_queue_t *queue);
+  void (*hsa_signal_store_relaxed_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  void (*hsa_signal_store_release_fn) (hsa_signal_t signal,
+				       hsa_signal_value_t value);
+  hsa_signal_value_t (*hsa_signal_wait_acquire_fn)
+    (hsa_signal_t signal, hsa_signal_condition_t condition,
+     hsa_signal_value_t compare_value, uint64_t timeout_hint,
+     hsa_wait_state_t wait_state_hint);
+  hsa_signal_value_t (*hsa_signal_load_acquire_fn) (hsa_signal_t signal);
+  hsa_status_t (*hsa_queue_destroy_fn) (hsa_queue_t *queue);
+
+  hsa_status_t (*hsa_code_object_deserialize_fn)
+    (void *serialized_code_object, size_t serialized_code_object_size,
+     const char *options, hsa_code_object_t *code_object);
+};
+
+/* Structure describing the run-time and grid properties of an HSA kernel
+   launch.  This needs to match the format passed to GOMP_OFFLOAD_run.  */
+
+struct GOMP_kernel_launch_attributes
+{
+  /* Number of dimensions the workload has.  Maximum number is 3.  */
+  uint32_t ndim;
+  /* Size of the grid in the three respective dimensions.  */
+  uint32_t gdims[3];
+  /* Size of work-groups in the respective dimensions.  */
+  uint32_t wdims[3];
+};
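+
+/* For example, an OpenMP launch of 4 teams with 16 threads each would be
+   described by parse_target_attributes, below, as ndim=3, gdims={4,64,16}
+   and wdims={1,64,16}, each thread being a 64-lane wavefront.  */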
+
+/* Collection of information needed to dispatch a kernel.  */
+
+struct kernel_dispatch
+{
+  struct agent_info *agent;
+  /* Pointer to a command queue associated with a kernel dispatch agent.  */
+  void *queue;
+  /* Pointer to a memory space used for kernel arguments passing.  */
+  void *kernarg_address;
+  /* Kernel object.  */
+  uint64_t object;
+  /* Synchronization signal used for dispatch synchronization.  */
+  uint64_t signal;
+  /* Private segment size.  */
+  uint32_t private_segment_size;
+  /* Group segment size.  */
+  uint32_t group_segment_size;
+};
+
+/* Structure of the kernargs segment, supporting console output.
+
+   This needs to match the definitions in Newlib, and the expectations
+   in libgomp target code.  */
+
+struct kernargs {
+  /* Leave space for the real kernel arguments.
+     OpenACC and OpenMP only use one pointer.  */
+  int64_t dummy1;
+  int64_t dummy2;
+
+  /* A pointer to struct output, below, for console output data.  */
+  int64_t out_ptr;
+
+  /* A pointer to struct heap, below.  */
+  int64_t heap_ptr;
+
+  /* A pointer to an ephemeral memory arena.
+    Only needed for OpenMP.  */
+  int64_t arena_ptr;
+
+  /* Output data.  */
+  struct output {
+    int return_value;
+    unsigned int next_output;
+    struct printf_data {
+      int written;
+      char msg[128];
+      int type;
+      union {
+	int64_t ivalue;
+	double dvalue;
+	char text[128];
+      };
+    } queue[1024];
+    unsigned int consumed;
+  } output_data;
+};
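+
+/* A sketch of the device-side producer protocol, which is implemented in
+   Newlib rather than in this file (names are illustrative):
+
+     unsigned int slot = __atomic_fetch_add (&output_data.next_output, 1,
+					     __ATOMIC_ACQUIRE);
+     struct printf_data *entry = &output_data.queue[slot % 1024];
+     ... fill in entry->msg, entry->type and the value union ...
+     __atomic_store_n (&entry->written, 1, __ATOMIC_RELEASE);
+
+   console_output, below, implements the host-side consumer.  */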
+
+/* A queue entry for a future asynchronous launch.  */
+
+struct kernel_launch
+{
+  struct kernel_info *kernel;
+  void *vars;
+  struct GOMP_kernel_launch_attributes kla;
+};
+
+/* A queue entry for a future callback.  */
+
+struct callback
+{
+  void (*fn)(void *);
+  void *data;
+};
+
+/* A data struct for the copy_data callback.  */
+
+struct copy_data
+{
+  void *dst;
+  const void *src;
+  size_t len;
+  bool free_src;
+  struct goacc_asyncqueue *aq;
+};
+
+/* A queue entry for a placeholder.  These correspond to a wait event.  */
+
+struct placeholder
+{
+  int executed;
+  pthread_cond_t cond;
+  pthread_mutex_t mutex;
+};
+
+/* A queue entry for a wait directive.  */
+
+struct asyncwait_info
+{
+  struct placeholder *placeholderp;
+};
+
+/* Encode the type of an entry in an async queue.  */
+
+enum entry_type
+{
+  KERNEL_LAUNCH,
+  CALLBACK,
+  ASYNC_WAIT,
+  ASYNC_PLACEHOLDER
+};
+
+/* An entry in an async queue.  */
+
+struct queue_entry
+{
+  enum entry_type type;
+  union {
+    struct kernel_launch launch;
+    struct callback callback;
+    struct asyncwait_info asyncwait;
+    struct placeholder placeholder;
+  } u;
+};
+
+/* An async queue header.
+
+   OpenMP may create one of these.
+   OpenACC may create many.  */
+
+struct goacc_asyncqueue
+{
+  struct agent_info *agent;
+  hsa_queue_t *hsa_queue;
+
+  pthread_t thread_drain_queue;
+  pthread_mutex_t mutex;
+  pthread_cond_t queue_cond_in;
+  pthread_cond_t queue_cond_out;
+  struct queue_entry queue[ASYNC_QUEUE_SIZE];
+  int queue_first;
+  int queue_n;
+  int drain_queue_stop;
+
+  int id;
+  struct goacc_asyncqueue *prev;
+  struct goacc_asyncqueue *next;
+};
+
+/* Mkoffload uses this structure to describe a kernel.
+
+   OpenMP kernel dimensions are passed at runtime.
+   OpenACC kernel dimensions are passed at compile time, here.  */
+
+struct hsa_kernel_description
+{
+  const char *name;
+  int oacc_dims[3];  /* Only present for GCN kernels.  */
+};
+
+/* Mkoffload uses this structure to describe an offload variable.  */
+
+struct global_var_info
+{
+  const char *name;
+  void *address;
+};
+
+/* Mkoffload uses this structure to describe all the kernels in a
+   loadable module.  These are passed to libgomp via static constructors.  */
+
+struct gcn_image_desc
+{
+  struct gcn_image {
+    size_t size;
+    void *image;
+  } *gcn_image;
+  const unsigned kernel_count;
+  struct hsa_kernel_description *kernel_infos;
+  const unsigned global_variable_count;
+  struct global_var_info *global_variables;
+};
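+
+/* For illustration, mkoffload emits a constructor along these lines (the
+   exact arguments to GOMP_offload_register_ver are elided):
+
+     static struct gcn_image_desc target_data = { &gcn_image, ... };
+
+     static __attribute__((constructor)) void
+     init (void)
+     {
+       GOMP_offload_register_ver (..., GOMP_DEVICE_GCN, &target_data);
+     }  */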
+
+/* Description of an HSA GPU agent (device) and the program associated with
+   it.  */
+
+struct agent_info
+{
+  /* The HSA ID of the agent.  Assigned when hsa_context is initialized.  */
+  hsa_agent_t id;
+  /* The user-visible device number.  */
+  int device_id;
+  /* Whether the agent has been initialized.  The fields below are usable only
+     if it has been.  */
+  bool initialized;
+  /* Precomputed check for problem architectures.  */
+  bool gfx900_p;
+
+  /* Command queues of the agent.  */
+  hsa_queue_t *sync_queue;
+  struct goacc_asyncqueue *async_queues, *omp_async_queue;
+  pthread_mutex_t async_queues_mutex;
+
+  /* The HSA memory region from which to allocate kernel arguments.  */
+  hsa_region_t kernarg_region;
+
+  /* The HSA memory region from which to allocate device data.  */
+  hsa_region_t data_region;
+
+  /* Allocated team arenas.  */
+  struct team_arena_list *team_arena_list;
+  pthread_mutex_t team_arena_write_lock;
+
+  /* Read-write lock that protects kernels which are running or about to be run
+     from interference with loading and unloading of images.  Needs to be
+     locked for reading while a kernel is being run, and for writing if the
+     list of modules is manipulated (and thus the HSA program invalidated).  */
+  pthread_rwlock_t module_rwlock;
+
+  /* The module associated with this agent.  */
+  struct module_info *module;
+
+  /* Mutex enforcing that only one thread will finalize the HSA program.  A
+     thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t prog_mutex;
+  /* Flag whether the HSA program that consists of all the modules has been
+     finalized.  */
+  bool prog_finalized;
+  /* HSA executable - the finalized program that is used to locate kernels.  */
+  hsa_executable_t executable;
+};
+
+/* Information required to identify, finalize and run any given kernel.  */
+
+enum offload_kind {KIND_UNKNOWN, KIND_OPENMP, KIND_OPENACC};
+
+struct kernel_info
+{
+  /* Name of the kernel, required to locate it within the GCN object-code
+     module.  */
+  const char *name;
+  /* The specific agent the kernel has been or will be finalized for and run
+     on.  */
+  struct agent_info *agent;
+  /* The specific module in which the kernel resides.  */
+  struct module_info *module;
+  /* Mutex enforcing that at most one thread ever initializes a kernel for
+     use.  A thread should have locked agent->module_rwlock for reading before
+     acquiring it.  */
+  pthread_mutex_t init_mutex;
+  /* Flag indicating whether the kernel has been initialized and all fields
+     below it contain valid data.  */
+  bool initialized;
+  /* Flag indicating that the kernel has a problem that blocks its
+     execution.  */
+  bool initialization_failed;
+  /* The object to be put into the dispatch queue.  */
+  uint64_t object;
+  /* Required size of kernel arguments.  */
+  uint32_t kernarg_segment_size;
+  /* Required size of group segment.  */
+  uint32_t group_segment_size;
+  /* Required size of private segment.  */
+  uint32_t private_segment_size;
+  /* Set up for OpenMP or OpenACC?  */
+  enum offload_kind kind;
+};
+
+/* Information about a particular GCN module, its image and kernels.  */
+
+struct module_info
+{
+  /* The description with which the program has registered the image.  */
+  struct gcn_image_desc *image_desc;
+  /* GCN heap allocation.  */
+  struct heap *heap;
+  /* Physical boundaries of the loaded module.  */
+  Elf64_Addr phys_address_start;
+  Elf64_Addr phys_address_end;
+
+  bool constructors_run_p;
+  struct kernel_info *init_array_func, *fini_array_func;
+
+  /* Number of kernels in this module.  */
+  int kernel_count;
+  /* An array of kernel_info structures describing each kernel in this
+     module.  */
+  struct kernel_info kernels[];
+};
+
+/* A linked list of memory arenas allocated on the device.
+   These are only used by OpenMP, as a means to optimize per-team malloc.  */
+
+struct team_arena_list
+{
+  struct team_arena_list *next;
+
+  /* The number of teams determines the size of the allocation.  */
+  int num_teams;
+  /* The device address of the arena itself.  */
+  void *arena;
+  /* A flag to prevent two asynchronous kernels trying to use the same arena.
+     The mutex is locked until the kernel exits.  */
+  pthread_mutex_t in_use;
+};
+
+/* Information about the whole HSA environment and all of its agents.  */
+
+struct hsa_context_info
+{
+  /* Whether the structure has been initialized.  */
+  bool initialized;
+  /* Number of usable GPU HSA agents in the system.  */
+  int agent_count;
+  /* Array of agent_info structures describing the individual HSA agents.  */
+  struct agent_info *agents;
+};
+
+/* Format of the on-device heap.
+
+   This must match the definition in Newlib and gcn-run.  */
+
+struct heap {
+  int64_t size;
+  char data[0];
+};
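+
+/* A heap of N usable bytes would be set up along these lines (a sketch; the
+   real allocation happens in the module-loading code, using
+   gcn_kernel_heap_size):
+
+     struct heap *heap;
+     hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+				     sizeof (struct heap) + N,
+				     (void **) &heap);
+     heap->size = N;  */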
+
+/* }}}  */
+/* {{{ Global variables  */
+
+/* Information about the whole HSA environment and all of its agents.  */
+
+static struct hsa_context_info hsa_context;
+
+/* HSA runtime functions that are initialized in init_hsa_context.  */
+
+static struct hsa_runtime_fn_info hsa_fns;
+
+/* Heap space, allocated target-side, provided for use of newlib malloc.
+   Each module should have its own heap allocated.
+   Beware that heap usage increases with OpenMP teams.  See also arenas.  */
+
+static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE;
+
+/* Flag deciding whether to print to stderr information about what is going
+   on.  Set in init_environment_variables.  */
+
+static bool debug;
+
+/* Flag to decide if the runtime should suppress a possible fallback to host
+   execution.  */
+
+static bool suppress_host_fallback;
+
+/* Name of the HSA runtime shared library that is dlopened by this
+   plugin.  */
+
+static const char *hsa_runtime_lib;
+
+/* Flag to decide if the runtime should also support CPU devices (which can
+   be a simulator).  */
+
+static bool support_cpu_devices;
+
+/* Runtime dimension overrides.  Zero indicates default.  */
+
+static int override_x_dim = 0;
+static int override_z_dim = 0;
+
+/* }}}  */
+/* {{{ Debug & Diagnostic  */
+
+/* Print a message to stderr if GCN_DEBUG value is set to true.  */
+
+#define DEBUG_PRINT(...) \
+  do \
+  { \
+    if (debug) \
+      { \
+	fprintf (stderr, __VA_ARGS__); \
+      } \
+  } \
+  while (false)
+
+/* Flush stderr if GCN_DEBUG value is set to true.  */
+
+#define DEBUG_FLUSH()				\
+  do {						\
+    if (debug)					\
+      fflush (stderr);				\
+  } while (false)
+
+/* Print a logging message with PREFIX to stderr if GCN_DEBUG value
+   is set to true.  */
+
+#define DEBUG_LOG(prefix, ...)			\
+  do						\
+    {						\
+      DEBUG_PRINT (prefix);			\
+      DEBUG_PRINT (__VA_ARGS__);			\
+      DEBUG_FLUSH ();				\
+    } while (false)
+
+/* Print a debugging message to stderr.  */
+
+#define GCN_DEBUG(...) DEBUG_LOG ("GCN debug: ", __VA_ARGS__)
+
+/* Print a warning message to stderr.  */
+
+#define GCN_WARNING(...) DEBUG_LOG ("GCN warning: ", __VA_ARGS__)
+
+/* Print HSA warning STR with an HSA STATUS code.  */
+
+static void
+hsa_warn (const char *str, hsa_status_t status)
+{
+  if (!debug)
+    return;
+
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+
+  fprintf (stderr, "GCN warning: %s\nRuntime message: %s\n", str,
+	   hsa_error_msg);
+}
+
+/* Report a fatal error STR together with the HSA error corresponding to STATUS
+   and terminate execution of the current process.  */
+
+static void
+hsa_fatal (const char *str, hsa_status_t status)
+{
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_fatal ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+}
+
+/* Like hsa_fatal, except only report error message, and return FALSE
+   for propagating error processing to outside of plugin.  */
+
+static bool
+hsa_error (const char *str, hsa_status_t status)
+{
+  const char *hsa_error_msg = "[unknown]";
+  hsa_fns.hsa_status_string_fn (status, &hsa_error_msg);
+  GOMP_PLUGIN_error ("GCN fatal error: %s\nRuntime message: %s\n", str,
+		     hsa_error_msg);
+  return false;
+}
+
+/* Dump information about the available hardware.  */
+
+static void
+dump_hsa_system_info (void)
+{
+  hsa_status_t status;
+
+  hsa_endianness_t endianness;
+  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_ENDIANNESS,
+					   &endianness);
+  if (status == HSA_STATUS_SUCCESS)
+    switch (endianness)
+      {
+      case HSA_ENDIANNESS_LITTLE:
+	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: LITTLE\n");
+	break;
+      case HSA_ENDIANNESS_BIG:
+	GCN_DEBUG ("HSA_SYSTEM_INFO_ENDIANNESS: BIG\n");
+	break;
+      default:
+	GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: UNKNOWN\n");
+      }
+  else
+    GCN_WARNING ("HSA_SYSTEM_INFO_ENDIANNESS: FAILED\n");
+
+  uint8_t extensions[128];
+  status = hsa_fns.hsa_system_get_info_fn (HSA_SYSTEM_INFO_EXTENSIONS,
+					   &extensions);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (extensions[0] & (1 << HSA_EXTENSION_IMAGES))
+	GCN_DEBUG ("HSA_SYSTEM_INFO_EXTENSIONS: IMAGES\n");
+    }
+  else
+    GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n");
+}
+
+/* Dump information about the available hardware.  */
+
+static void
+dump_machine_model (hsa_machine_model_t machine_model, const char *s)
+{
+  switch (machine_model)
+    {
+    case HSA_MACHINE_MODEL_SMALL:
+      GCN_DEBUG ("%s: SMALL\n", s);
+      break;
+    case HSA_MACHINE_MODEL_LARGE:
+      GCN_DEBUG ("%s: LARGE\n", s);
+      break;
+    default:
+      GCN_WARNING ("%s: UNKNOWN\n", s);
+      break;
+    }
+}
+
+/* Dump information about the available hardware.  */
+
+static void
+dump_profile (hsa_profile_t profile, const char *s)
+{
+  switch (profile)
+    {
+    case HSA_PROFILE_FULL:
+      GCN_DEBUG ("%s: FULL\n", s);
+      break;
+    case HSA_PROFILE_BASE:
+      GCN_DEBUG ("%s: BASE\n", s);
+      break;
+    default:
+      GCN_WARNING ("%s: UNKNOWN\n", s);
+      break;
+    }
+}
+
+/* Dump information about a device memory region.  */
+
+static hsa_status_t
+dump_hsa_region (hsa_region_t region, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  hsa_region_segment_t segment;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+					   &segment);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      if (segment == HSA_REGION_SEGMENT_GLOBAL)
+	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GLOBAL\n");
+      else if (segment == HSA_REGION_SEGMENT_READONLY)
+	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: READONLY\n");
+      else if (segment == HSA_REGION_SEGMENT_PRIVATE)
+	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: PRIVATE\n");
+      else if (segment == HSA_REGION_SEGMENT_GROUP)
+	GCN_DEBUG ("HSA_REGION_INFO_SEGMENT: GROUP\n");
+      else
+	GCN_WARNING ("HSA_REGION_INFO_SEGMENT: UNKNOWN\n");
+    }
+  else
+    GCN_WARNING ("HSA_REGION_INFO_SEGMENT: FAILED\n");
+
+  if (status == HSA_STATUS_SUCCESS && segment == HSA_REGION_SEGMENT_GLOBAL)
+    {
+      uint32_t flags;
+      status
+	= hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+					  &flags);
+      if (status == HSA_STATUS_SUCCESS)
+	{
+	  if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG)
+	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: KERNARG\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED)
+	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: FINE_GRAINED\n");
+	  if (flags & HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED)
+	    GCN_DEBUG ("HSA_REGION_INFO_GLOBAL_FLAGS: COARSE_GRAINED\n");
+	}
+      else
+	GCN_WARNING ("HSA_REGION_INFO_GLOBAL_FLAGS: FAILED\n");
+    }
+
+  size_t size;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SIZE, &size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_REGION_INFO_SIZE: %zu\n", size);
+  else
+    GCN_WARNING ("HSA_REGION_INFO_SIZE: FAILED\n");
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_ALLOC_MAX_SIZE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_REGION_INFO_ALLOC_MAX_SIZE: %zu\n", size);
+  else
+    GCN_WARNING ("HSA_REGION_INFO_ALLOC_MAX_SIZE: FAILED\n");
+
+  bool alloc_allowed;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED,
+				      &alloc_allowed);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: %u\n", alloc_allowed);
+  else
+    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALLOWED: FAILED\n");
+
+  if (status != HSA_STATUS_SUCCESS || !alloc_allowed)
+    return HSA_STATUS_SUCCESS;
+
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE,
+				      &size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: %zu\n", size);
+  else
+    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_GRANULE: FAILED\n");
+
+  size_t align;
+  status
+    = hsa_fns.hsa_region_get_info_fn (region,
+				      HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT,
+				      &align);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: %zu\n", align);
+  else
+    GCN_WARNING ("HSA_REGION_INFO_RUNTIME_ALLOC_ALIGNMENT: FAILED\n");
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Dump information about all the device memory regions.  */
+
+static void
+dump_hsa_regions (hsa_agent_t agent)
+{
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent,
+						 dump_hsa_region,
+						 NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_error ("Dumping hsa regions failed", status);
+}
+
+/* Dump information about the available devices.  */
+
+static hsa_status_t
+dump_hsa_agent_info (hsa_agent_t agent, void *data __attribute__((unused)))
+{
+  hsa_status_t status;
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_NAME: %s\n", buf);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_NAME: FAILED\n");
+
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_VENDOR_NAME,
+					  &buf);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_VENDOR_NAME: %s\n", buf);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_VENDOR_NAME: FAILED\n");
+
+  hsa_machine_model_t machine_model;
+  status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_MACHINE_MODEL,
+				     &machine_model);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_machine_model (machine_model, "HSA_AGENT_INFO_MACHINE_MODEL");
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_MACHINE_MODEL: FAILED\n");
+
+  hsa_profile_t profile;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_PROFILE,
+					  &profile);
+  if (status == HSA_STATUS_SUCCESS)
+    dump_profile (profile, "HSA_AGENT_INFO_PROFILE");
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_PROFILE: FAILED\n");
+
+  hsa_device_type_t device_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+					  &device_type);
+  if (status == HSA_STATUS_SUCCESS)
+    {
+      switch (device_type)
+	{
+	case HSA_DEVICE_TYPE_CPU:
+	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: CPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_GPU:
+	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: GPU\n");
+	  break;
+	case HSA_DEVICE_TYPE_DSP:
+	  GCN_DEBUG ("HSA_AGENT_INFO_DEVICE: DSP\n");
+	  break;
+	default:
+	  GCN_WARNING ("HSA_AGENT_INFO_DEVICE: UNKNOWN\n");
+	  break;
+	}
+    }
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_DEVICE: FAILED\n");
+
+  uint32_t cu_count;
+  status = hsa_fns.hsa_agent_get_info_fn
+    (agent, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: %u\n", cu_count);
+  else
+    GCN_WARNING ("HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT: FAILED\n");
+
+  uint32_t size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_WAVEFRONT_SIZE,
+					  &size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_WAVEFRONT_SIZE: %u\n", size);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_WAVEFRONT_SIZE: FAILED\n");
+
+  uint32_t max_dim;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
+					  &max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: %u\n", max_dim);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_DIM: FAILED\n");
+
+  uint32_t max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent,
+					  HSA_AGENT_INFO_WORKGROUP_MAX_SIZE,
+					  &max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: %u\n", max_size);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_WORKGROUP_MAX_SIZE: FAILED\n");
+
+  uint32_t grid_max_dim;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_DIM,
+					  &grid_max_dim);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_DIM: %u\n", grid_max_dim);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_DIM: FAILED\n");
+
+  uint32_t grid_max_size;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_GRID_MAX_SIZE,
+					  &grid_max_size);
+  if (status == HSA_STATUS_SUCCESS)
+    GCN_DEBUG ("HSA_AGENT_INFO_GRID_MAX_SIZE: %u\n", grid_max_size);
+  else
+    GCN_WARNING ("HSA_AGENT_INFO_GRID_MAX_SIZE: FAILED\n");
+
+  dump_hsa_regions (agent);
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Forward reference.  */
+
+static char *get_executable_symbol_name (hsa_executable_symbol_t symbol);
+
+/* Helper function for dump_executable_symbols.  */
+
+static hsa_status_t
+dump_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t symbol,
+			void *data __attribute__((unused)))
+{
+  char *name = get_executable_symbol_name (symbol);
+
+  if (name)
+    {
+      GCN_DEBUG ("executable symbol: %s\n", name);
+      free (name);
+    }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Dump all global symbols in an executable.  */
+
+static void
+dump_executable_symbols (hsa_executable_t executable)
+{
+  hsa_status_t status;
+  status
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 dump_executable_symbol,
+						 NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not dump HSA executable symbols", status);
+}
+
+/* Dump kernel DISPATCH data structure and indent it by INDENT spaces.  */
+
+static void
+print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent)
+{
+  struct kernargs *kernargs = (struct kernargs *)dispatch->kernarg_address;
+
+  fprintf (stderr, "%*sthis: %p\n", indent, "", dispatch);
+  fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue);
+  fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs);
+  fprintf (stderr, "%*sheap address: %p\n", indent, "",
+	   (void*)kernargs->heap_ptr);
+  fprintf (stderr, "%*sarena address: %p\n", indent, "",
+	   (void*)kernargs->arena_ptr);
+  fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object);
+  fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "",
+	   dispatch->private_segment_size);
+  fprintf (stderr, "%*sgroup_segment_size: %u\n", indent, "",
+	   dispatch->group_segment_size);
+  fprintf (stderr, "\n");
+}
+
+/* }}}  */
+/* {{{ Utility functions  */
+
+/* Cast the thread local storage to gcn_thread.  */
+
+static inline struct gcn_thread *
+gcn_thread (void)
+{
+  return (struct gcn_thread *) GOMP_PLUGIN_acc_thread ();
+}
+
+/* Initialize debug and suppress_host_fallback according to the environment.  */
+
+static void
+init_environment_variables (void)
+{
+  if (secure_getenv ("GCN_DEBUG"))
+    debug = true;
+  else
+    debug = false;
+
+  if (secure_getenv ("GCN_SUPPRESS_HOST_FALLBACK"))
+    suppress_host_fallback = true;
+  else
+    suppress_host_fallback = false;
+
+  hsa_runtime_lib = secure_getenv ("HSA_RUNTIME_LIB");
+  if (hsa_runtime_lib == NULL)
+    hsa_runtime_lib = HSA_RUNTIME_LIB "libhsa-runtime64.so";
+
+  support_cpu_devices = secure_getenv ("GCN_SUPPORT_CPU_DEVICES");
+
+  const char *x = secure_getenv ("GCN_NUM_TEAMS");
+  if (!x)
+    x = secure_getenv ("GCN_NUM_GANGS");
+  if (x)
+    override_x_dim = atoi (x);
+
+  const char *z = secure_getenv ("GCN_NUM_THREADS");
+  if (!z)
+    z = secure_getenv ("GCN_NUM_WORKERS");
+  if (z)
+    override_z_dim = atoi (z);
+
+  const char *heap = secure_getenv ("GCN_HEAP_SIZE");
+  if (heap)
+    {
+      size_t tmp = atol (heap);
+      if (tmp)
+	gcn_kernel_heap_size = tmp;
+    }
+}
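+
+/* For example (illustrative):
+
+     GCN_DEBUG=1 GCN_NUM_TEAMS=4 GCN_HEAP_SIZE=268435456 ./a.out
+
+   enables debug output, overrides the default grid x-dimension, and
+   requests a 256MB heap.  */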
+
+/* Return malloc'd string with name of SYMBOL.  */
+
+static char *
+get_executable_symbol_name (hsa_executable_symbol_t symbol)
+{
+  hsa_status_t status;
+  char *res;
+  uint32_t len;
+  const hsa_executable_symbol_info_t info_name_length
+    = HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name_length,
+						      &len);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get length of symbol name", status);
+      return NULL;
+    }
+
+  res = GOMP_PLUGIN_malloc (len + 1);
+
+  const hsa_executable_symbol_info_t info_name
+    = HSA_EXECUTABLE_SYMBOL_INFO_NAME;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn (symbol, info_name, res);
+
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not get symbol name", status);
+      free (res);
+      return NULL;
+    }
+
+  res[len] = '\0';
+
+  return res;
+}
+
+/* Helper function for find_executable_symbol.  */
+
+static hsa_status_t
+find_executable_symbol_1 (hsa_executable_t executable,
+			  hsa_executable_symbol_t symbol,
+			  void *data)
+{
+  hsa_executable_symbol_t *res = (hsa_executable_symbol_t *)data;
+  *res = symbol;
+  return HSA_STATUS_INFO_BREAK;
+}
+
+/* Find a global symbol in EXECUTABLE, save to *SYMBOL and return true.  If not
+   found, return false.  */
+
+static bool
+find_executable_symbol (hsa_executable_t executable,
+			hsa_executable_symbol_t *symbol)
+{
+  hsa_status_t status;
+
+  status
+    = hsa_fns.hsa_executable_iterate_symbols_fn (executable,
+						 find_executable_symbol_1,
+						 symbol);
+  if (status != HSA_STATUS_INFO_BREAK)
+    {
+      hsa_error ("Could not find executable symbol", status);
+      return false;
+    }
+
+  return true;
+}
+
+/* Get the number of GPU Compute Units.  */
+
+static int
+get_cu_count (struct agent_info *agent)
+{
+  uint32_t cu_count;
+  hsa_status_t status = hsa_fns.hsa_agent_get_info_fn
+    (agent->id, HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, &cu_count);
+  if (status == HSA_STATUS_SUCCESS)
+    return cu_count;
+  else
+    return 64;  /* The usual number for older devices.  */
+}
+
+/* Calculate the maximum grid size for OMP threads / OACC workers.
+   This depends on the kernel's resource usage levels.  */
+
+static int
+limit_worker_threads (int threads)
+{
+  /* FIXME Do something more intelligent here.
+     GCN can always run 4 threads within a Compute Unit, but
+     more than that depends on register usage.  */
+  if (threads > 16)
+    threads = 16;
+  return threads;
+}
+
+/* Parse the target attributes INPUT provided by the compiler and return true
+   if we should run anything at all.  If INPUT is NULL, fill DEF with default
+   values, then store INPUT or DEF into *RESULT.
+
+   This is used for OpenMP only.  */
+
+static bool
+parse_target_attributes (void **input,
+			 struct GOMP_kernel_launch_attributes *def,
+			 struct GOMP_kernel_launch_attributes **result,
+			 struct agent_info *agent)
+{
+  if (!input)
+    GOMP_PLUGIN_fatal ("No target arguments provided");
+
+  bool grid_attrs_found = false;
+  bool gcn_dims_found = false;
+  int gcn_teams = 0;
+  int gcn_threads = 0;
+  while (*input)
+    {
+      intptr_t id = (intptr_t) *input++, val;
+
+      if (id & GOMP_TARGET_ARG_SUBSEQUENT_PARAM)
+	val = (intptr_t) *input++;
+      else
+	val = id >> GOMP_TARGET_ARG_VALUE_SHIFT;
+
+      val = (val > INT_MAX) ? INT_MAX : val;
+
+      if ((id & GOMP_TARGET_ARG_DEVICE_MASK) == GOMP_DEVICE_GCN
+	  && ((id & GOMP_TARGET_ARG_ID_MASK)
+	      == GOMP_TARGET_ARG_HSA_KERNEL_ATTRIBUTES))
+	{
+	  grid_attrs_found = true;
+	  break;
+	}
+      else if ((id & GOMP_TARGET_ARG_DEVICE_ALL) == GOMP_TARGET_ARG_DEVICE_ALL)
+	{
+	  gcn_dims_found = true;
+	  switch (id & GOMP_TARGET_ARG_ID_MASK)
+	    {
+	    case GOMP_TARGET_ARG_NUM_TEAMS:
+	      gcn_teams = val;
+	      break;
+	    case GOMP_TARGET_ARG_THREAD_LIMIT:
+	      gcn_threads = limit_worker_threads (val);
+	      break;
+	    default:
+	      ;
+	    }
+	}
+    }
+
+  if (gcn_dims_found)
+    {
+      if (agent->gfx900_p && gcn_threads == 0 && override_z_dim == 0)
+	{
+	  gcn_threads = 4;
+	  GCN_WARNING ("VEGA BUG WORKAROUND: reducing default number of "
+		       "threads to 4 per team.\n");
+	  GCN_WARNING (" - If this is not a Vega 10 device, please use "
+		       "GCN_NUM_THREADS=16\n");
+	}
+
+      def->ndim = 3;
+      /* Fiji has 64 CUs, but Vega20 has 60.  */
+      def->gdims[0] = (gcn_teams > 0) ? gcn_teams : get_cu_count (agent);
+      /* Each thread is 64 work items wide.  */
+      def->gdims[1] = 64;
+      /* A work group can have 16 wavefronts.  */
+      def->gdims[2] = (gcn_threads > 0) ? gcn_threads : 16;
+      def->wdims[0] = 1; /* Single team per work-group.  */
+      def->wdims[1] = 64;
+      def->wdims[2] = 16;
+      *result = def;
+      return true;
+    }
+  else if (!grid_attrs_found)
+    {
+      def->ndim = 1;
+      def->gdims[0] = 1;
+      def->gdims[1] = 1;
+      def->gdims[2] = 1;
+      def->wdims[0] = 1;
+      def->wdims[1] = 1;
+      def->wdims[2] = 1;
+      *result = def;
+      GCN_WARNING ("GOMP_OFFLOAD_run called with no launch attributes\n");
+      return true;
+    }
+
+  struct GOMP_kernel_launch_attributes *kla;
+  kla = (struct GOMP_kernel_launch_attributes *) *input;
+  *result = kla;
+  if (kla->ndim == 0 || kla->ndim > 3)
+    GOMP_PLUGIN_fatal ("Invalid number of dimensions (%u)", kla->ndim);
+
+  GCN_DEBUG ("GOMP_OFFLOAD_run called with %u dimensions:\n", kla->ndim);
+  unsigned i;
+  for (i = 0; i < kla->ndim; i++)
+    {
+      GCN_DEBUG ("  Dimension %u: grid size %u and group size %u\n", i,
+		 kla->gdims[i], kla->wdims[i]);
+      if (kla->gdims[i] == 0)
+	return false;
+    }
+  return true;
+}
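+
+/* For example (illustrative), libgomp encodes a num_teams(4) clause for any
+   device as the single argument
+
+     (void *) (GOMP_TARGET_ARG_DEVICE_ALL | GOMP_TARGET_ARG_NUM_TEAMS
+	       | (4 << GOMP_TARGET_ARG_VALUE_SHIFT))
+
+   which the loop above decodes into gcn_teams == 4.  */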
+
+/* Return the group size given the requested GROUP size, GRID size and number
+   of grid dimensions NDIM.  */
+
+static uint32_t
+get_group_size (uint32_t ndim, uint32_t grid, uint32_t group)
+{
+  if (group == 0)
+    {
+      /* TODO: Provide a default via environment or device characteristics.  */
+      if (ndim == 1)
+	group = 64;
+      else if (ndim == 2)
+	group = 8;
+      else
+	group = 4;
+    }
+
+  if (group > grid)
+    group = grid;
+  return group;
+}
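+
+/* E.g. for ndim=1, grid=32 and group=0 this yields min (64, 32) == 32.  */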
+
+/* Atomically store pair of uint16_t values (HEADER and REST) to a PACKET.  */
+
+static void
+packet_store_release (uint32_t* packet, uint16_t header, uint16_t rest)
+{
+  __atomic_store_n (packet, header | (rest << 16), __ATOMIC_RELEASE);
+}
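+
+/* A kernel dispatch would combine the packet type and setup fields with a
+   single release store, e.g. (a sketch using standard hsa.h constants):
+
+     packet_store_release ((uint32_t *) packet,
+			   HSA_PACKET_TYPE_KERNEL_DISPATCH
+			   << HSA_PACKET_HEADER_TYPE,
+			   ndim
+			   << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);  */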
+
+/* A callback for the HSA command queues that should never be called; these
+   signal events that we don't use, so we treat any call as an error.
+
+   This "queue" is not to be confused with the async queues, below.  */
+
+static void
+hsa_queue_callback (hsa_status_t status,
+		hsa_queue_t *queue __attribute__ ((unused)),
+		void *data __attribute__ ((unused)))
+{
+  hsa_fatal ("Asynchronous queue error", status);
+}
+
+/* }}}  */
+/* {{{ HSA initialization  */
+
+/* Populate hsa_fns with the function addresses from libhsa-runtime64.so.  */
+
+static bool
+init_hsa_runtime_functions (void)
+{
+#define DLSYM_FN(function) \
+  hsa_fns.function##_fn = dlsym (handle, #function); \
+  if (hsa_fns.function##_fn == NULL) \
+    return false;
+  void *handle = dlopen (hsa_runtime_lib, RTLD_LAZY);
+  if (handle == NULL)
+    return false;
+
+  DLSYM_FN (hsa_status_string)
+  DLSYM_FN (hsa_system_get_info)
+  DLSYM_FN (hsa_agent_get_info)
+  DLSYM_FN (hsa_init)
+  DLSYM_FN (hsa_iterate_agents)
+  DLSYM_FN (hsa_region_get_info)
+  DLSYM_FN (hsa_queue_create)
+  DLSYM_FN (hsa_agent_iterate_regions)
+  DLSYM_FN (hsa_executable_destroy)
+  DLSYM_FN (hsa_executable_create)
+  DLSYM_FN (hsa_executable_global_variable_define)
+  DLSYM_FN (hsa_executable_load_code_object)
+  DLSYM_FN (hsa_executable_freeze)
+  DLSYM_FN (hsa_signal_create)
+  DLSYM_FN (hsa_memory_allocate)
+  DLSYM_FN (hsa_memory_assign_agent)
+  DLSYM_FN (hsa_memory_copy)
+  DLSYM_FN (hsa_memory_free)
+  DLSYM_FN (hsa_signal_destroy)
+  DLSYM_FN (hsa_executable_get_symbol)
+  DLSYM_FN (hsa_executable_symbol_get_info)
+  DLSYM_FN (hsa_executable_iterate_symbols)
+  DLSYM_FN (hsa_queue_add_write_index_release)
+  DLSYM_FN (hsa_queue_load_read_index_acquire)
+  DLSYM_FN (hsa_signal_wait_acquire)
+  DLSYM_FN (hsa_signal_store_relaxed)
+  DLSYM_FN (hsa_signal_store_release)
+  DLSYM_FN (hsa_signal_load_acquire)
+  DLSYM_FN (hsa_queue_destroy)
+  DLSYM_FN (hsa_code_object_deserialize)
+  return true;
+#undef DLSYM_FN
+}
+
+/* Return true if the agent is a GPU and can accept concurrent submissions
+   from different threads.  */
+
+static bool
+suitable_hsa_agent_p (hsa_agent_t agent)
+{
+  hsa_device_type_t device_type;
+  hsa_status_t status
+    = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_DEVICE,
+				     &device_type);
+  if (status != HSA_STATUS_SUCCESS)
+    return false;
+
+  switch (device_type)
+    {
+    case HSA_DEVICE_TYPE_GPU:
+      break;
+    case HSA_DEVICE_TYPE_CPU:
+      if (!support_cpu_devices)
+	return false;
+      break;
+    default:
+      return false;
+    }
+
+  uint32_t features = 0;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_FEATURE,
+					  &features);
+  if (status != HSA_STATUS_SUCCESS
+      || !(features & HSA_AGENT_FEATURE_KERNEL_DISPATCH))
+    return false;
+  hsa_queue_type_t queue_type;
+  status = hsa_fns.hsa_agent_get_info_fn (agent, HSA_AGENT_INFO_QUEUE_TYPE,
+					  &queue_type);
+  if (status != HSA_STATUS_SUCCESS
+      || (queue_type != HSA_QUEUE_TYPE_MULTI))
+    return false;
+
+  return true;
+}
+
+/* Callback of hsa_iterate_agents; if AGENT is a GPU device, increment
+   agent_count in hsa_context.  */
+
+static hsa_status_t
+count_gpu_agents (hsa_agent_t agent, void *data __attribute__ ((unused)))
+{
+  if (suitable_hsa_agent_p (agent))
+    hsa_context.agent_count++;
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Callback of hsa_iterate_agents; if AGENT is a GPU device, assign the agent
+   id to the describing structure in the hsa context.  The index of the
+   structure is pointed to by DATA; increment it afterwards.  */
+
+static hsa_status_t
+assign_agent_ids (hsa_agent_t agent, void *data)
+{
+  if (suitable_hsa_agent_p (agent))
+    {
+      int *agent_index = (int *) data;
+      hsa_context.agents[*agent_index].id = agent;
+      ++*agent_index;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Initialize hsa_context if it has not already been done.
+   Return TRUE on success.  */
+
+static bool
+init_hsa_context (void)
+{
+  hsa_status_t status;
+  int agent_index = 0;
+
+  if (hsa_context.initialized)
+    return true;
+  init_environment_variables ();
+  if (!init_hsa_runtime_functions ())
+    {
+      GCN_WARNING ("Run-time could not be dynamically opened\n");
+      if (suppress_host_fallback)
+	GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+      return false;
+    }
+  status = hsa_fns.hsa_init_fn ();
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Run-time could not be initialized", status);
+  GCN_DEBUG ("HSA run-time initialized for GCN\n");
+
+  if (debug)
+    dump_hsa_system_info ();
+
+  status = hsa_fns.hsa_iterate_agents_fn (count_gpu_agents, NULL);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("GCN GPU devices could not be enumerated", status);
+  GCN_DEBUG ("There are %i GCN GPU devices.\n", hsa_context.agent_count);
+
+  hsa_context.agents
+    = GOMP_PLUGIN_malloc_cleared (hsa_context.agent_count
+				  * sizeof (struct agent_info));
+  status = hsa_fns.hsa_iterate_agents_fn (assign_agent_ids, &agent_index);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Scanning GCN agents failed", status);
+  if (agent_index != hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Failed to assign IDs to all GCN agents");
+      return false;
+    }
+
+  if (debug)
+    {
+      status = hsa_fns.hsa_iterate_agents_fn (dump_hsa_agent_info, NULL);
+      if (status != HSA_STATUS_SUCCESS)
+	GOMP_PLUGIN_error ("Failed to list all HSA runtime agents");
+    }
+
+  hsa_context.initialized = true;
+  return true;
+}
+
+/* Verify that hsa_context has already been initialized and return the
+   agent_info structure describing device number N.  Return NULL on error.  */
+
+static struct agent_info *
+get_agent_info (int n)
+{
+  if (!hsa_context.initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use uninitialized GCN context.");
+      return NULL;
+    }
+  if (n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to operate on non-existent GCN device %i", n);
+      return NULL;
+    }
+  if (!hsa_context.agents[n].initialized)
+    {
+      GOMP_PLUGIN_error ("Attempt to use an uninitialized GCN agent.");
+      return NULL;
+    }
+  return &hsa_context.agents[n];
+}
+
+/* Callback of hsa_agent_iterate_regions, via get_*_memory_region functions.
+
+   Selects (breaks at) a suitable region of type KIND.  */
+
+static hsa_status_t
+get_memory_region (hsa_region_t region, hsa_region_t *retval,
+		   hsa_region_global_flag_t kind)
+{
+  hsa_status_t status;
+  hsa_region_segment_t segment;
+
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_SEGMENT,
+					   &segment);
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+  if (segment != HSA_REGION_SEGMENT_GLOBAL)
+    return HSA_STATUS_SUCCESS;
+
+  uint32_t flags;
+  status = hsa_fns.hsa_region_get_info_fn (region, HSA_REGION_INFO_GLOBAL_FLAGS,
+					   &flags);
+  if (status != HSA_STATUS_SUCCESS)
+    return status;
+  if (flags & kind)
+    {
+      *retval = region;
+      return HSA_STATUS_INFO_BREAK;
+    }
+  return HSA_STATUS_SUCCESS;
+}
+
+/* Callback of hsa_agent_iterate_regions.
+
+   Selects a kernargs memory region.  */
+
+static hsa_status_t
+get_kernarg_memory_region (hsa_region_t region, void *data)
+{
+  return get_memory_region (region, (hsa_region_t *)data,
+			    HSA_REGION_GLOBAL_FLAG_KERNARG);
+}
+
+/* Callback of hsa_agent_iterate_regions.
+
+   Selects a coarse-grained memory region suitable for the heap and
+   offload data.  */
+
+static hsa_status_t
+get_data_memory_region (hsa_region_t region, void *data)
+{
+  return get_memory_region (region, (hsa_region_t *)data,
+			    HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED);
+}
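+
+/* Both are used with hsa_agent_iterate_regions, e.g. (a sketch; the real
+   call sites are in the device initialization code, later in this file):
+
+     hsa_region_t region;
+     region.handle = (uint64_t) -1;
+     hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+					   get_kernarg_memory_region,
+					   &region);
+
+   Iteration stops early, with HSA_STATUS_INFO_BREAK, once a suitable region
+   has been selected.  */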
+
+/* }}}  */
+/* {{{ Run  */
+
+/* Create or reuse a team arena.
+
+   Team arenas are used by OpenMP to avoid calling malloc multiple times
+   while setting up each team.  This is purely a performance optimization.
+
+   Allocating an arena also costs performance, albeit on the host side, so
+   this function will reuse an existing arena if a large enough one is idle.
+   The arena is released, but not deallocated, when the kernel exits.  */
+
+static void *
+get_team_arena (struct agent_info *agent, int num_teams)
+{
+  struct team_arena_list **next_ptr = &agent->team_arena_list;
+  struct team_arena_list *item;
+
+  for (item = *next_ptr; item; next_ptr = &item->next, item = item->next)
+    {
+      if (item->num_teams < num_teams)
+	continue;
+
+      if (pthread_mutex_trylock (&item->in_use))
+	continue;
+
+      return item->arena;
+    }
+
+  GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams);
+
+  if (pthread_mutex_lock (&agent->team_arena_write_lock))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN team arena mutex");
+      return NULL;
+    }
+  item = malloc (sizeof (*item));
+  item->num_teams = num_teams;
+  item->next = NULL;
+  *next_ptr = item;
+
+  if (pthread_mutex_init (&item->in_use, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
+      return NULL;
+    }
+  if (pthread_mutex_lock (&item->in_use))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN team arena mutex");
+      return NULL;
+    }
+  if (pthread_mutex_unlock (&agent->team_arena_write_lock))
+    {
+      GOMP_PLUGIN_error ("Could not unlock a GCN team arena mutex");
+      return NULL;
+    }
+
+  const int TEAM_ARENA_SIZE = 64*1024;  /* Must match libgomp.h.  */
+  hsa_status_t status;
+  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+					   TEAM_ARENA_SIZE*num_teams,
+					   &item->arena);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for GCN kernel arena", status);
+  status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id,
+					       HSA_ACCESS_PERMISSION_RW);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not assign arena memory to device", status);
+
+  return item->arena;
+}
+
+/* Mark a team arena available for reuse.  */
+
+static void
+release_team_arena (struct agent_info* agent, void *arena)
+{
+  struct team_arena_list *item;
+
+  for (item = agent->team_arena_list; item; item = item->next)
+    {
+      if (item->arena == arena)
+	{
+	  if (pthread_mutex_unlock (&item->in_use))
+	    GOMP_PLUGIN_error ("Could not unlock a GCN team arena mutex");
+	  return;
+	}
+    }
+  GOMP_PLUGIN_error ("Could not find a GCN arena to release.");
+}
+
+/* Clean up all the allocated team arenas.  */
+
+static bool
+destroy_team_arenas (struct agent_info *agent)
+{
+  struct team_arena_list *item, *next;
+
+  for (item = agent->team_arena_list; item; item = next)
+    {
+      next = item->next;
+      hsa_fns.hsa_memory_free_fn (item->arena);
+      if (pthread_mutex_destroy (&item->in_use))
+	{
+	  GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+	  return false;
+	}
+      free (item);
+    }
+  agent->team_arena_list = NULL;
+
+  return true;
+}
+
+/* Allocate memory on a specified device.  */
+
+static void *
+alloc_by_agent (struct agent_info *agent, size_t size)
+{
+  GCN_DEBUG ("Allocating %zu bytes on device %d\n", size, agent->device_id);
+
+  /* Zero-size allocations are invalid, so in order to return a valid pointer
+     we need to pass a valid size.  One source of zero-size allocations is
+     kernargs for kernels that have no inputs or outputs (the kernel may
+     only use console output, for example).  */
+  if (size == 0)
+    size = 4;
+
+  void *ptr;
+  hsa_status_t status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+							size, &ptr);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not allocate device memory", status);
+      return NULL;
+    }
+
+  status = hsa_fns.hsa_memory_assign_agent_fn (ptr, agent->id,
+					       HSA_ACCESS_PERMISSION_RW);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not assign data memory to device", status);
+      return NULL;
+    }
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_alloc;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = size;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) ptr;
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
+  return ptr;
+}
+
+/* Create kernel dispatch data structure for given KERNEL, along with
+   the necessary device signals and memory allocations.  */
+
+static struct kernel_dispatch *
+create_kernel_dispatch (struct kernel_info *kernel, int num_teams)
+{
+  struct agent_info *agent = kernel->agent;
+  struct kernel_dispatch *shadow
+    = GOMP_PLUGIN_malloc_cleared (sizeof (struct kernel_dispatch));
+
+  shadow->agent = kernel->agent;
+  shadow->object = kernel->object;
+
+  hsa_signal_t sync_signal;
+  hsa_status_t status = hsa_fns.hsa_signal_create_fn (1, 0, NULL, &sync_signal);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating the GCN sync signal", status);
+
+  shadow->signal = sync_signal.handle;
+  shadow->private_segment_size = kernel->private_segment_size;
+  shadow->group_segment_size = kernel->group_segment_size;
+
+  /* We expect kernels to request a single pointer, explicitly, and the
+     rest of struct kernargs, implicitly.  If they request anything else
+     then something is wrong.  */
+  if (kernel->kernarg_segment_size > 8)
+    {
+      GOMP_PLUGIN_fatal ("Unexpectedly large kernargs segment requested");
+      return NULL;
+    }
+
+  status = hsa_fns.hsa_memory_allocate_fn (agent->kernarg_region,
+					   sizeof (struct kernargs),
+					   &shadow->kernarg_address);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not allocate memory for GCN kernel arguments", status);
+  struct kernargs *kernargs = shadow->kernarg_address;
+
+  /* Zero-initialize the output_data (minimum needed).  */
+  kernargs->out_ptr = (int64_t)&kernargs->output_data;
+  kernargs->output_data.next_output = 0;
+  for (unsigned i = 0;
+       i < (sizeof (kernargs->output_data.queue)
+	    / sizeof (kernargs->output_data.queue[0]));
+       i++)
+    kernargs->output_data.queue[i].written = 0;
+  kernargs->output_data.consumed = 0;
+
+  /* Pass in the heap location.  */
+  kernargs->heap_ptr = (int64_t)kernel->module->heap;
+
+  /* Create an arena.  */
+  if (kernel->kind == KIND_OPENMP)
+    kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams);
+  else
+    kernargs->arena_ptr = 0;
+
+  /* Ensure we can recognize unset return values.  */
+  kernargs->output_data.return_value = 0xcafe0000;
+
+  return shadow;
+}
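+
+/* Note on the kernarg layout (a reading of the code above, not a separate
+   ABI document): the kernel's only explicit argument is the VARS pointer,
+   which run_kernel copies into the first 8 bytes of the kernargs block;
+   the implicit fields (out_ptr, heap_ptr, arena_ptr and the output
+   buffer) follow at fixed offsets within struct kernargs.  */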
+
+/* Output any data written to console output from the kernel.  It is expected
+   that this function is polled during kernel execution.
+
+   We print all entries from the last item printed to the next entry without
+   a "written" flag.  If the "final" flag is set then it'll continue right to
+   the end.
+ 
+   The print buffer is circular, but the from and to locations don't wrap when
+   the buffer does, so the output limit is UINT_MAX.  The target blocks on
+   output when the buffer is full.  */
+
+static void
+console_output (struct kernel_info *kernel, struct kernargs *kernargs,
+		bool final)
+{
+  unsigned int limit = (sizeof (kernargs->output_data.queue)
+			/ sizeof (kernargs->output_data.queue[0]));
+
+  unsigned int from = __atomic_load_n (&kernargs->output_data.consumed,
+				       __ATOMIC_ACQUIRE);
+  unsigned int to = kernargs->output_data.next_output;
+
+  if (from > to)
+    {
+      /* Overflow.  */
+      if (final)
+	printf ("GCN print buffer overflowed.\n");
+      return;
+    }
+
+  unsigned int i;
+  for (i = from; i < to; i++)
+    {
+      struct printf_data *data = &kernargs->output_data.queue[i % limit];
+
+      if (!data->written && !final)
+	break;
+
+      switch (data->type)
+	{
+	case 0: printf ("%.128s%ld\n", data->msg, data->ivalue); break;
+	case 1: printf ("%.128s%f\n", data->msg, data->dvalue); break;
+	case 2: printf ("%.128s%.128s\n", data->msg, data->text); break;
+	case 3: printf ("%.128s%.128s", data->msg, data->text); break;
+	default: printf ("GCN print buffer error!\n"); break;
+	}
+      data->written = 0;
+      __atomic_store_n (&kernargs->output_data.consumed, i + 1,
+			__ATOMIC_RELEASE);
+    }
+  fflush (stdout);
+}
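+
+/* A worked example of the indexing above (hypothetical numbers): with a
+   64-slot queue, if the host has consumed 128 entries and the target has
+   written 130, then FROM == 128 and TO == 130, and the pending entries
+   live in slots 128 % 64 == 0 and 129 % 64 == 1.  Because both counters
+   only ever increase, FROM > TO can occur only once NEXT_OUTPUT wraps
+   past UINT_MAX, which console_output reports as an overflow.  */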
+
+/* Release data structure created for a kernel dispatch in SHADOW argument,
+   and clean up the signal and memory allocations.  */
+
+static void
+release_kernel_dispatch (struct kernel_dispatch *shadow)
+{
+  GCN_DEBUG ("Released kernel dispatch: %p\n", shadow);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  void *arena = (void *)kernargs->arena_ptr;
+  if (arena)
+    release_team_arena (shadow->agent, arena);
+
+  hsa_fns.hsa_memory_free_fn (shadow->kernarg_address);
+
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  hsa_fns.hsa_signal_destroy_fn (s);
+
+  free (shadow);
+}
+
+/* Extract the properties from a kernel binary.  */
+
+static void
+init_kernel_properties (struct kernel_info *kernel)
+{
+  hsa_status_t status;
+  struct agent_info *agent = kernel->agent;
+  hsa_executable_symbol_t kernel_symbol;
+  status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						 kernel->name, agent->id,
+						 0, &kernel_symbol);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_warn ("Could not find symbol for kernel in the code object", status);
+      fprintf (stderr, "not found name: '%s'\n", kernel->name);
+      dump_executable_symbols (agent->executable);
+      goto failure;
+    }
+  GCN_DEBUG ("Located kernel %s\n", kernel->name);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel->object);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not extract a kernel object from its symbol", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+     &kernel->kernarg_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel argument size", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+     &kernel->group_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel group segment size", status);
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+     &kernel->private_segment_size);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Could not get info about kernel private segment size",
+	       status);
+
+  /* The kernel type is not known until something tries to launch it.  */
+  kernel->kind = KIND_UNKNOWN;
+
+  GCN_DEBUG ("Kernel structure for %s fully initialized with "
+	     "following segment sizes: \n", kernel->name);
+  GCN_DEBUG ("  group_segment_size: %u\n",
+	     (unsigned) kernel->group_segment_size);
+  GCN_DEBUG ("  private_segment_size: %u\n",
+	     (unsigned) kernel->private_segment_size);
+  GCN_DEBUG ("  kernarg_segment_size: %u\n",
+	     (unsigned) kernel->kernarg_segment_size);
+  return;
+
+failure:
+  kernel->initialization_failed = true;
+}
+
+/* Do all the work that is necessary before running KERNEL for the first time.
+   The function assumes the program has been created, finalized and frozen by
+   create_and_finalize_hsa_program.  */
+
+static void
+init_kernel (struct kernel_info *kernel)
+{
+  if (pthread_mutex_lock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not lock a GCN kernel initialization mutex");
+  if (kernel->initialized)
+    {
+      if (pthread_mutex_unlock (&kernel->init_mutex))
+	GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+			   "mutex");
+
+      return;
+    }
+
+  init_kernel_properties (kernel);
+
+  if (!kernel->initialization_failed)
+    {
+      GCN_DEBUG ("\n");
+
+      kernel->initialized = true;
+    }
+  if (pthread_mutex_unlock (&kernel->init_mutex))
+    GOMP_PLUGIN_fatal ("Could not unlock a GCN kernel initialization "
+		       "mutex");
+}
+
+/* Run KERNEL on its agent, pass VARS to it as arguments and take
+   launch attributes from KLA.
+
+   MODULE_LOCKED indicates that the caller already holds the lock and
+   run_kernel need not lock it again.
+   If AQ is NULL then agent->sync_queue will be used.  */
+
+static void
+run_kernel (struct kernel_info *kernel, void *vars,
+	    struct GOMP_kernel_launch_attributes *kla,
+	    struct goacc_asyncqueue *aq, bool module_locked)
+{
+  GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
+	     (aq ? aq->id : 0));
+  GCN_DEBUG ("GCN launch attribs: gdims:[");
+  int i;
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	DEBUG_PRINT (", ");
+      DEBUG_PRINT ("%u", kla->gdims[i]);
+    }
+  DEBUG_PRINT ("], normalized gdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	DEBUG_PRINT (", ");
+      DEBUG_PRINT ("%u", kla->gdims[i] / kla->wdims[i]);
+    }
+  DEBUG_PRINT ("], wdims:[");
+  for (i = 0; i < kla->ndim; ++i)
+    {
+      if (i)
+	DEBUG_PRINT (", ");
+      DEBUG_PRINT ("%u", kla->wdims[i]);
+    }
+  DEBUG_PRINT ("]\n");
+  DEBUG_FLUSH ();
+
+  struct agent_info *agent = kernel->agent;
+  if (!module_locked && pthread_rwlock_rdlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to read-lock a GCN agent rwlock");
+
+  if (!agent->initialized)
+    GOMP_PLUGIN_fatal ("Agent must be initialized");
+
+  if (!kernel->initialized)
+    GOMP_PLUGIN_fatal ("Called kernel must be initialized");
+
+  hsa_queue_t *command_q = (aq ? aq->hsa_queue : kernel->agent->sync_queue);
+
+  uint64_t index
+    = hsa_fns.hsa_queue_add_write_index_release_fn (command_q, 1);
+  GCN_DEBUG ("Got AQL index %llu\n", (long long int) index);
+
+  /* Wait until the queue is not full before writing the packet.  */
+  while (index - hsa_fns.hsa_queue_load_read_index_acquire_fn (command_q)
+	 >= command_q->size)
+    ;
+
+  /* Do not allow the dimensions to be overridden when running
+     constructors or destructors.  */
+  int override_x = kernel->kind == KIND_UNKNOWN ? 0 : override_x_dim;
+  int override_z = kernel->kind == KIND_UNKNOWN ? 0 : override_z_dim;
+
+  hsa_kernel_dispatch_packet_t *packet;
+  packet = ((hsa_kernel_dispatch_packet_t *) command_q->base_address)
+	   + index % command_q->size;
+
+  /* The header and setup fields are written last, atomically, via
+     packet_store_release, so skip the first 4 bytes here.  */
+  memset (((uint8_t *) packet) + 4, 0, sizeof (*packet) - 4);
+  packet->grid_size_x = override_x ? : kla->gdims[0];
+  packet->workgroup_size_x = get_group_size (kla->ndim,
+					     packet->grid_size_x,
+					     kla->wdims[0]);
+
+  if (kla->ndim >= 2)
+    {
+      packet->grid_size_y = kla->gdims[1];
+      packet->workgroup_size_y = get_group_size (kla->ndim, kla->gdims[1],
+						 kla->wdims[1]);
+    }
+  else
+    {
+      packet->grid_size_y = 1;
+      packet->workgroup_size_y = 1;
+    }
+
+  if (kla->ndim == 3)
+    {
+      packet->grid_size_z = limit_worker_threads (override_z
+						  ? : kla->gdims[2]);
+      packet->workgroup_size_z = get_group_size (kla->ndim,
+						 packet->grid_size_z,
+						 kla->wdims[2]);
+    }
+  else
+    {
+      packet->grid_size_z = 1;
+      packet->workgroup_size_z = 1;
+    }
+
+  GCN_DEBUG ("GCN launch actuals: grid:[%u, %u, %u],"
+	     " normalized grid:[%u, %u, %u], workgroup:[%u, %u, %u]\n",
+	     packet->grid_size_x, packet->grid_size_y, packet->grid_size_z,
+	     packet->grid_size_x / packet->workgroup_size_x,
+	     packet->grid_size_y / packet->workgroup_size_y,
+	     packet->grid_size_z / packet->workgroup_size_z,
+	     packet->workgroup_size_x, packet->workgroup_size_y,
+	     packet->workgroup_size_z);
+
+  struct kernel_dispatch *shadow
+    = create_kernel_dispatch (kernel, packet->grid_size_x);
+  shadow->queue = command_q;
+
+  if (debug)
+    {
+      fprintf (stderr, "\nKernel has following dependencies:\n");
+      print_kernel_dispatch (shadow, 2);
+    }
+
+  packet->private_segment_size = kernel->private_segment_size;
+  packet->group_segment_size = kernel->group_segment_size;
+  packet->kernel_object = kernel->object;
+  packet->kernarg_address = shadow->kernarg_address;
+  hsa_signal_t s;
+  s.handle = shadow->signal;
+  packet->completion_signal = s;
+  hsa_fns.hsa_signal_store_relaxed_fn (s, 1);
+  memcpy (shadow->kernarg_address, &vars, sizeof (vars));
+
+  GCN_DEBUG ("Copying kernel runtime pointer to kernarg_address\n");
+
+  uint16_t header;
+  header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
+  header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
+
+  GCN_DEBUG ("Going to dispatch kernel %s on device %d\n", kernel->name,
+	     agent->device_id);
+
+  packet_store_release ((uint32_t *) packet, header,
+			(uint16_t) kla->ndim
+			<< HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS);
+
+  hsa_fns.hsa_signal_store_release_fn (command_q->doorbell_signal,
+				       index);
+
+  GCN_DEBUG ("Kernel dispatched, waiting for completion\n");
+
+  /* Root signal waits with 1ms timeout.  */
+  while (hsa_fns.hsa_signal_wait_acquire_fn (s, HSA_SIGNAL_CONDITION_LT, 1,
+					     1000 * 1000,
+					     HSA_WAIT_STATE_BLOCKED) != 0)
+    {
+      console_output (kernel, shadow->kernarg_address, false);
+    }
+  console_output (kernel, shadow->kernarg_address, true);
+
+  struct kernargs *kernargs = shadow->kernarg_address;
+  unsigned int return_value = (unsigned int)kernargs->output_data.return_value;
+
+  release_kernel_dispatch (shadow);
+
+  if (!module_locked && pthread_rwlock_unlock (&agent->module_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to unlock a GCN agent rwlock");
+
+  unsigned int upper = (return_value & ~0xffff) >> 16;
+  if (upper == 0xcafe)
+    ; /* Exit not called; normal termination.  */
+  else if (upper == 0xffff)
+    ; /* Exit called.  */
+  else
+    {
+      GOMP_PLUGIN_error ("Possible kernel exit value corruption, 2 most"
+			 " significant bytes aren't 0xffff or 0xcafe: 0x%x\n",
+			 return_value);
+      abort ();
+    }
+
+  if (upper == 0xffff)
+    {
+      unsigned int signal = (return_value >> 8) & 0xff;
+
+      if (signal == SIGABRT)
+	{
+	  GCN_WARNING ("GCN Kernel aborted\n");
+	  abort ();
+	}
+      else if (signal != 0)
+	{
+	  GCN_WARNING ("GCN Kernel received unknown signal\n");
+	  abort ();
+	}
+
+      GCN_DEBUG ("GCN Kernel exited with value: %d\n", return_value & 0xff);
+      exit (return_value & 0xff);
+    }
+}
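+
+/* To illustrate the return-value decoding above (a reconstruction from
+   the checks, not a separate specification): the sentinel 0xcafe in the
+   two most significant bytes means the kernel never called exit; 0xffff
+   there means exit was called, with any signal number in byte 1 and the
+   exit code in byte 0.  For example, 0xffff0000 | (SIGABRT << 8) makes
+   the plugin abort, while 0xffff002a makes it exit with status 42.  */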
+
+/* }}}  */
+/* {{{ Load/Unload  */
+
+/* Initialize KERNEL from D and other parameters.  Return true on
+   success.  */
+
+static bool
+init_basic_kernel_info (struct kernel_info *kernel,
+			struct hsa_kernel_description *d,
+			struct agent_info *agent,
+			struct module_info *module)
+{
+  kernel->agent = agent;
+  kernel->module = module;
+  kernel->name = d->name;
+  if (pthread_mutex_init (&kernel->init_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");
+      return false;
+    }
+  return true;
+}
+
+/* Find the load_offset for MODULE, save to *LOAD_OFFSET, and return true.  If
+   not found, return false.  */
+
+static bool
+find_load_offset (Elf64_Addr *load_offset, struct agent_info *agent,
+		  struct module_info *module, Elf64_Ehdr *image,
+		  Elf64_Shdr *sections)
+{
+  bool res = false;
+
+  hsa_status_t status;
+
+  hsa_executable_symbol_t symbol;
+  if (!find_executable_symbol (agent->executable, &symbol))
+    return false;
+
+  status = hsa_fns.hsa_executable_symbol_get_info_fn
+    (symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, load_offset);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not extract symbol address", status);
+      return false;
+    }
+
+  char *symbol_name = get_executable_symbol_name (symbol);
+  if (symbol_name == NULL)
+    return false;
+
+  /* Find the kernel function in ELF, and calculate actual load offset.  */
+  for (int i = 0; i < image->e_shnum; i++)
+    if (sections[i].sh_type == SHT_SYMTAB)
+      {
+	Elf64_Shdr *strtab = &sections[sections[i].sh_link];
+	char *strings = (char *)image + strtab->sh_offset;
+
+	for (size_t offset = 0;
+	     offset < sections[i].sh_size;
+	     offset += sections[i].sh_entsize)
+	  {
+	    Elf64_Sym *sym = (Elf64_Sym*)((char*)image
+					  + sections[i].sh_offset
+					  + offset);
+	    if (strcmp (symbol_name, strings + sym->st_name) == 0)
+	      {
+		*load_offset -= sym->st_value;
+		res = true;
+		break;
+	      }
+	  }
+      }
+
+  free (symbol_name);
+  return res;
+}
+
+/* Create and finalize the program consisting of all loaded modules.  */
+
+static bool
+create_and_finalize_hsa_program (struct agent_info *agent)
+{
+  hsa_status_t status;
+  int reloc_count = 0;
+  bool res = true;
+  if (pthread_mutex_lock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex");
+      return false;
+    }
+  if (agent->prog_finalized)
+    goto final;
+
+  status
+    = hsa_fns.hsa_executable_create_fn (HSA_PROFILE_FULL,
+					HSA_EXECUTABLE_STATE_UNFROZEN,
+					"", &agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not create GCN executable", status);
+      goto fail;
+    }
+
+  /* Load any GCN modules.  */
+  struct module_info *module = agent->module;
+  if (module)
+    {
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+
+      /* Hide relocations from the HSA runtime loader.
+	 Keep a copy of the unmodified section headers to use later.  */
+      Elf64_Shdr *image_sections = (Elf64_Shdr *)((char *)image
+						  + image->e_shoff);
+      for (int i = image->e_shnum - 1; i >= 0; i--)
+	{
+	  if (image_sections[i].sh_type == SHT_RELA
+	      || image_sections[i].sh_type == SHT_REL)
+	    /* Change section type to something harmless.  */
+	    image_sections[i].sh_type |= 0x80;
+	}
+
+      hsa_code_object_t co = { 0 };
+      status = hsa_fns.hsa_code_object_deserialize_fn
+	(module->image_desc->gcn_image->image,
+	 module->image_desc->gcn_image->size,
+	 NULL, &co);
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not deserialize GCN code object", status);
+	  goto fail;
+	}
+
+      status = hsa_fns.hsa_executable_load_code_object_fn
+	(agent->executable, agent->id, co, "");
+      if (status != HSA_STATUS_SUCCESS)
+	{
+	  hsa_error ("Could not load GCN code object", status);
+	  goto fail;
+	}
+
+      if (!module->heap)
+	{
+	  status = hsa_fns.hsa_memory_allocate_fn (agent->data_region,
+						   gcn_kernel_heap_size,
+						   (void**)&module->heap);
+	  if (status != HSA_STATUS_SUCCESS)
+	    {
+	      hsa_error ("Could not allocate memory for GCN heap", status);
+	      goto fail;
+	    }
+
+	  status = hsa_fns.hsa_memory_assign_agent_fn
+			(module->heap, agent->id, HSA_ACCESS_PERMISSION_RW);
+	  if (status != HSA_STATUS_SUCCESS)
+	    {
+	      hsa_error ("Could not assign GCN heap memory to device", status);
+	      goto fail;
+	    }
+
+	  hsa_fns.hsa_memory_copy_fn (&module->heap->size,
+				      &gcn_kernel_heap_size,
+				      sizeof (gcn_kernel_heap_size));
+	}
+    }
+
+  if (debug)
+    dump_executable_symbols (agent->executable);
+
+  status = hsa_fns.hsa_executable_freeze_fn (agent->executable, "");
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not freeze the GCN executable", status);
+      goto fail;
+    }
+
+  if (agent->module)
+    {
+      struct module_info *module = agent->module;
+      Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
+      Elf64_Shdr *sections = (Elf64_Shdr *)((char *)image + image->e_shoff);
+
+      Elf64_Addr load_offset;
+      if (!find_load_offset (&load_offset, agent, module, image, sections))
+	goto fail;
+
+      /* Record the physical load address range.
+	 We need this for data copies later.  */
+      Elf64_Phdr *segments = (Elf64_Phdr *)((char*)image + image->e_phoff);
+      Elf64_Addr low = ~0, high = 0;
+      for (int i = 0; i < image->e_phnum; i++)
+	if (segments[i].p_memsz > 0)
+	  {
+	    if (segments[i].p_paddr < low)
+	      low = segments[i].p_paddr;
+	    if (segments[i].p_paddr > high)
+	      high = segments[i].p_paddr + segments[i].p_memsz - 1;
+	  }
+      module->phys_address_start = low + load_offset;
+      module->phys_address_end = high + load_offset;
+
+      /* Find the dynamic symbol table.  */
+      Elf64_Shdr *dynsym = NULL;
+      for (int i = 0; i < image->e_shnum; i++)
+	if (sections[i].sh_type == SHT_DYNSYM)
+	  {
+	    dynsym = &sections[i];
+	    break;
+	  }
+
+      /* Fix up relocations.  */
+      for (int i = 0; i < image->e_shnum; i++)
+	{
+	  if (sections[i].sh_type == (SHT_RELA | 0x80))
+	    for (size_t offset = 0;
+		 offset < sections[i].sh_size;
+		 offset += sections[i].sh_entsize)
+	      {
+		Elf64_Rela *reloc = (Elf64_Rela*)((char*)image
+						  + sections[i].sh_offset
+						  + offset);
+		Elf64_Sym *sym =
+		  (dynsym
+		   ? (Elf64_Sym*)((char*)image
+				  + dynsym->sh_offset
+				  + (dynsym->sh_entsize
+				     * ELF64_R_SYM (reloc->r_info)))
+		   : NULL);
+
+		int64_t S = (sym ? sym->st_value : 0);
+		int64_t P = reloc->r_offset + load_offset;
+		int64_t A = reloc->r_addend;
+		int64_t B = load_offset;
+		int64_t V, size;
+		switch (ELF64_R_TYPE (reloc->r_info))
+		  {
+		  case R_AMDGPU_ABS32_LO:
+		    V = (S + A) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS32_HI:
+		    V = (S + A) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_ABS64:
+		    V = S + A;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_REL32:
+		    V = S + A - P;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL64:
+		    /* FIXME
+		       LLD seems to emit REL64 where the assembler has
+		       ABS64.  This is clearly wrong because it's not what the
+		       compiler is expecting.  Let's assume, for now, that
+		       it's a bug.  In any case, GCN kernels are always
+		       self-contained and therefore relative relocations will
+		       have been resolved already, so this should be a safe
+		       workaround.  */
+		    V = S + A/* - P*/;
+		    size = 8;
+		    break;
+		  case R_AMDGPU_ABS32:
+		    V = S + A;
+		    size = 4;
+		    break;
+		    /* TODO R_AMDGPU_GOTPCREL */
+		    /* TODO R_AMDGPU_GOTPCREL32_LO */
+		    /* TODO R_AMDGPU_GOTPCREL32_HI */
+		  case R_AMDGPU_REL32_LO:
+		    V = (S + A - P) & 0xFFFFFFFF;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_REL32_HI:
+		    V = (S + A - P) >> 32;
+		    size = 4;
+		    break;
+		  case R_AMDGPU_RELATIVE64:
+		    V = B + A;
+		    size = 8;
+		    break;
+		  default:
+		    fprintf (stderr, "Error: unsupported relocation type.\n");
+		    exit (1);
+		  }
+		status = hsa_fns.hsa_memory_copy_fn ((void*)P, &V, size);
+		if (status != HSA_STATUS_SUCCESS)
+		  {
+		    hsa_error ("Failed to fix up relocation", status);
+		    goto fail;
+		  }
+		reloc_count++;
+	      }
+	}
+    }
+
+  GCN_DEBUG ("Loaded GCN kernels to device %d (%d relocations)\n",
+	     agent->device_id, reloc_count);
+
+final:
+  agent->prog_finalized = true;
+
+  if (pthread_mutex_unlock (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex");
+      res = false;
+    }
+
+  return res;
+
+fail:
+  res = false;
+  goto final;
+}
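+
+/* A worked example of the relocation arithmetic above (illustrative
+   values only): suppose the module slides by B == 0x10000, a relocation
+   has r_offset == 0x2000 and r_addend == 8, and the referenced symbol
+   has st_value == 0x3000.  Then P == 0x12000, and an R_AMDGPU_ABS64
+   entry stores V == S + A == 0x3008 as 8 bytes at P, while an
+   R_AMDGPU_REL32 entry would store V == S + A - P, truncated to 4 bytes.
+   The hsa_memory_copy calls perform these stores directly in the loaded
+   image on the device.  */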
+
+/* Free the HSA program in agent and everything associated with it and set
+   agent->prog_finalized and the initialized flags of all kernels to false.
+   Return TRUE on success.  */
+
+static bool
+destroy_hsa_program (struct agent_info *agent)
+{
+  if (!agent->prog_finalized)
+    return true;
+
+  hsa_status_t status;
+
+  GCN_DEBUG ("Destroying the current GCN program.\n");
+
+  status = hsa_fns.hsa_executable_destroy_fn (agent->executable);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not destroy GCN executable", status);
+
+  if (agent->module)
+    {
+      int i;
+      for (i = 0; i < agent->module->kernel_count; i++)
+	agent->module->kernels[i].initialized = false;
+
+      if (agent->module->heap)
+	{
+	  hsa_fns.hsa_memory_free_fn (agent->module->heap);
+	  agent->module->heap = NULL;
+	}
+    }
+  agent->prog_finalized = false;
+  return true;
+}
+
+/* Deinitialize all information associated with MODULE and kernels within
+   it.  LOCKED indicates whether the caller already holds the agent's
+   module lock.  Return TRUE on success.  */
+
+static bool
+destroy_module (struct module_info *module, bool locked)
+{
+  /* Run destructors before destroying module.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->fini_array_func)
+    {
+      init_kernel (module->fini_array_func);
+      run_kernel (module->fini_array_func, NULL, &kla, NULL, locked);
+    }
+  module->constructors_run_p = false;
+
+  int i;
+  for (i = 0; i < module->kernel_count; i++)
+    if (pthread_mutex_destroy (&module->kernels[i].init_mutex))
+      {
+	GOMP_PLUGIN_error ("Failed to destroy a GCN kernel initialization "
+			   "mutex");
+	return false;
+      }
+
+  return true;
+}
+
+/* }}}  */
+/* {{{ Async  */
+
+/* Execute a queue entry: a kernel launch, a callback, an async wait, or
+   a placeholder marker.  */
+
+static void
+execute_queue_entry (struct goacc_asyncqueue *aq, int index)
+{
+  struct queue_entry *entry = &aq->queue[index];
+
+  switch (entry->type)
+    {
+    case KERNEL_LAUNCH:
+      if (DEBUG_QUEUES)
+	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      run_kernel (entry->u.launch.kernel,
+		  entry->u.launch.vars,
+		  &entry->u.launch.kla, aq, false);
+      if (DEBUG_QUEUES)
+	GCN_DEBUG ("Async thread %d:%d: Executing launch entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    case CALLBACK:
+      if (DEBUG_QUEUES)
+	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d)\n",
+		   aq->agent->device_id, aq->id, index);
+      entry->u.callback.fn (entry->u.callback.data);
+      if (DEBUG_QUEUES)
+	GCN_DEBUG ("Async thread %d:%d: Executing callback entry (%d) done\n",
+		   aq->agent->device_id, aq->id, index);
+      break;
+
+    case ASYNC_WAIT:
+      {
+	/* FIXME: is it safe to access a placeholder that may already have
+	   been executed?  */
+	struct placeholder *placeholderp = entry->u.asyncwait.placeholderp;
+
+	if (DEBUG_QUEUES)
+	  GCN_DEBUG ("Async thread %d:%d: Executing async wait entry (%d)\n",
+		     aq->agent->device_id, aq->id, index);
+
+	pthread_mutex_lock (&placeholderp->mutex);
+
+	while (!placeholderp->executed)
+	  pthread_cond_wait (&placeholderp->cond, &placeholderp->mutex);
+
+	pthread_mutex_unlock (&placeholderp->mutex);
+
+	if (pthread_cond_destroy (&placeholderp->cond))
+	  GOMP_PLUGIN_error ("Failed to destroy serialization cond");
+
+	if (pthread_mutex_destroy (&placeholderp->mutex))
+	  GOMP_PLUGIN_error ("Failed to destroy serialization mutex");
+
+	if (DEBUG_QUEUES)
+	  GCN_DEBUG ("Async thread %d:%d: Executing async wait "
+		     "entry (%d) done\n", aq->agent->device_id, aq->id, index);
+      }
+      break;
+
+    case ASYNC_PLACEHOLDER:
+      pthread_mutex_lock (&entry->u.placeholder.mutex);
+      entry->u.placeholder.executed = 1;
+      pthread_cond_signal (&entry->u.placeholder.cond);
+      pthread_mutex_unlock (&entry->u.placeholder.mutex);
+      break;
+
+    default:
+      GOMP_PLUGIN_fatal ("Unknown queue element");
+    }
+}
+
+/* This function is run as a thread to service an async queue in the
+   background.  It runs continuously until the stop flag is set.  */
+
+static void *
+drain_queue (void *thread_arg)
+{
+  struct goacc_asyncqueue *aq = thread_arg;
+
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      aq->drain_queue_stop = 2;
+      return NULL;
+    }
+
+  pthread_mutex_lock (&aq->mutex);
+
+  while (true)
+    {
+      if (aq->drain_queue_stop)
+	break;
+
+      if (aq->queue_n > 0)
+	{
+	  pthread_mutex_unlock (&aq->mutex);
+	  execute_queue_entry (aq, aq->queue_first);
+
+	  pthread_mutex_lock (&aq->mutex);
+	  aq->queue_first = ((aq->queue_first + 1)
+			     % ASYNC_QUEUE_SIZE);
+	  aq->queue_n--;
+
+	  if (DEBUG_THREAD_SIGNAL)
+	    GCN_DEBUG ("Async thread %d:%d: broadcasting queue out update\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_broadcast (&aq->queue_cond_out);
+	  pthread_mutex_unlock (&aq->mutex);
+
+	  if (DEBUG_QUEUES)
+	    GCN_DEBUG ("Async thread %d:%d: continue\n", aq->agent->device_id,
+		       aq->id);
+	  pthread_mutex_lock (&aq->mutex);
+	}
+      else
+	{
+	  if (DEBUG_THREAD_SLEEP)
+	    GCN_DEBUG ("Async thread %d:%d: going to sleep\n",
+		       aq->agent->device_id, aq->id);
+	  pthread_cond_wait (&aq->queue_cond_in, &aq->mutex);
+	  if (DEBUG_THREAD_SLEEP)
+	    GCN_DEBUG ("Async thread %d:%d: woke up, rechecking\n",
+		       aq->agent->device_id, aq->id);
+	}
+    }
+
+  aq->drain_queue_stop = 2;
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("Async thread %d:%d: broadcasting last queue out update\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_broadcast (&aq->queue_cond_out);
+  pthread_mutex_unlock (&aq->mutex);
+
+  GCN_DEBUG ("Async thread %d:%d: returning\n", aq->agent->device_id, aq->id);
+  return NULL;
+}
+
+/* This function is used only when DRAIN_QUEUE_SYNCHRONOUS_P is set, which
+   is not usually the case.  This is just a debug tool.  */
+
+static void
+drain_queue_synchronous (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+
+  while (aq->queue_n > 0)
+    {
+      execute_queue_entry (aq, aq->queue_first);
+
+      aq->queue_first = ((aq->queue_first + 1)
+			 % ASYNC_QUEUE_SIZE);
+      aq->queue_n--;
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Block the current thread until an async queue is writable.  */
+
+static void
+wait_for_queue_nonfull (struct goacc_asyncqueue *aq)
+{
+  if (aq->queue_n == ASYNC_QUEUE_SIZE)
+    {
+      pthread_mutex_lock (&aq->mutex);
+
+      /* Queue is full.  Wait for it to not be full.  */
+      while (aq->queue_n == ASYNC_QUEUE_SIZE)
+	pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+
+      pthread_mutex_unlock (&aq->mutex);
+    }
+}
+
+/* Request an asynchronous kernel launch on the specified queue.  This
+   may block if the queue is full, but returns without waiting for the
+   kernel to run.  */
+
+static void
+queue_push_launch (struct goacc_asyncqueue *aq, struct kernel_info *kernel,
+		   void *vars, struct GOMP_kernel_launch_attributes *kla)
+{
+  assert (aq->agent == kernel->agent);
+
+  wait_for_queue_nonfull (aq);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    GCN_DEBUG ("queue_push_launch %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = KERNEL_LAUNCH;
+  aq->queue[queue_last].u.launch.kernel = kernel;
+  aq->queue[queue_last].u.launch.vars = vars;
+  aq->queue[queue_last].u.launch.kla = *kla;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Request an asynchronous callback on the specified queue.  The callback
+   function will be called, with the given opaque data, from the appropriate
+   async thread, when all previous items on that queue are complete.  */
+
+static void
+queue_push_callback (struct goacc_asyncqueue *aq, void (*fn)(void *),
+		     void *data)
+{
+  wait_for_queue_nonfull (aq);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  int queue_last = ((aq->queue_first + aq->queue_n)
+		    % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    GCN_DEBUG ("queue_push_callback %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = CALLBACK;
+  aq->queue[queue_last].u.callback.fn = fn;
+  aq->queue[queue_last].u.callback.data = data;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Request that a given async thread wait for another thread (unspecified) to
+   reach the given placeholder.  The wait will occur when all previous entries
+   on the queue are complete.  A placeholder is effectively a kind of signal
+   which simply sets a flag when encountered in a queue.  */
+
+static void
+queue_push_asyncwait (struct goacc_asyncqueue *aq,
+		      struct placeholder *placeholderp)
+{
+  wait_for_queue_nonfull (aq);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    GCN_DEBUG ("queue_push_asyncwait %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = ASYNC_WAIT;
+  aq->queue[queue_last].u.asyncwait.placeholderp = placeholderp;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+}
+
+/* Add a placeholder into an async queue.  When the async thread reaches the
+   placeholder it will set the "executed" flag to true and continue.
+   Another thread may be waiting on this thread reaching the placeholder.  */
+
+static struct placeholder *
+queue_push_placeholder (struct goacc_asyncqueue *aq)
+{
+  struct placeholder *placeholderp;
+
+  wait_for_queue_nonfull (aq);
+
+  pthread_mutex_lock (&aq->mutex);
+
+  int queue_last = ((aq->queue_first + aq->queue_n) % ASYNC_QUEUE_SIZE);
+  if (DEBUG_QUEUES)
+    GCN_DEBUG ("queue_push_placeholder %d:%d: at %i\n", aq->agent->device_id,
+	       aq->id, queue_last);
+
+  aq->queue[queue_last].type = ASYNC_PLACEHOLDER;
+  placeholderp = &aq->queue[queue_last].u.placeholder;
+
+  if (pthread_mutex_init (&placeholderp->mutex, NULL))
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      GOMP_PLUGIN_error ("Failed to initialize serialization mutex");
+    }
+
+  if (pthread_cond_init (&placeholderp->cond, NULL))
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      GOMP_PLUGIN_error ("Failed to initialize serialization cond");
+    }
+
+  placeholderp->executed = 0;
+
+  aq->queue_n++;
+
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  pthread_mutex_unlock (&aq->mutex);
+
+  return placeholderp;
+}
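+
+/* A sketch of how the two entry types above combine, using only the
+   functions in this file: to make queue AQ2 wait for queue AQ1, push a
+   placeholder onto AQ1 and an async wait onto AQ2:
+
+     struct placeholder *p = queue_push_placeholder (aq1);
+     queue_push_asyncwait (aq2, p);
+
+   AQ2's drain thread then blocks on the placeholder's condition variable
+   until AQ1's drain thread reaches the placeholder and signals it.  */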
+
+/* Signal an asynchronous thread to terminate, and wait for it to do so.  */
+
+static void
+finalize_async_thread (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+  if (aq->drain_queue_stop == 2)
+    {
+      pthread_mutex_unlock (&aq->mutex);
+      return;
+    }
+
+  aq->drain_queue_stop = 1;
+
+  if (DEBUG_THREAD_SIGNAL)
+    GCN_DEBUG ("Signalling async thread %d:%d: cond_in\n",
+	       aq->agent->device_id, aq->id);
+  pthread_cond_signal (&aq->queue_cond_in);
+
+  while (aq->drain_queue_stop != 2)
+    {
+      if (DEBUG_THREAD_SLEEP)
+	GCN_DEBUG ("Waiting for async thread %d:%d to finish, putting thread"
+		   " to sleep\n", aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	GCN_DEBUG ("Waiting, woke up thread %d:%d.  Rechecking\n",
+		   aq->agent->device_id, aq->id);
+    }
+
+  GCN_DEBUG ("Done waiting for async thread %d:%d\n", aq->agent->device_id,
+	     aq->id);
+  pthread_mutex_unlock (&aq->mutex);
+
+  int err = pthread_join (aq->thread_drain_queue, NULL);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("Join async thread %d:%d: failed: %s",
+		       aq->agent->device_id, aq->id, strerror (err));
+  GCN_DEBUG ("Joined with async thread %d:%d\n", aq->agent->device_id, aq->id);
+}
+
+/* Set up an async queue for OpenMP.  There will be only one.  The
+   implementation simply uses an OpenACC async queue.
+   FIXME: is this thread-safe if two threads call this function?  */
+
+static void
+maybe_init_omp_async (struct agent_info *agent)
+{
+  if (!agent->omp_async_queue)
+    agent->omp_async_queue
+      = GOMP_OFFLOAD_openacc_async_construct (agent->device_id);
+}
+
+/* Copy data to or from a device.  This is intended for use as an async
+   callback event.  */
+
+static void
+copy_data (void *data_)
+{
+  struct copy_data *data = (struct copy_data *)data_;
+  GCN_DEBUG ("Async thread %d:%d: Copying %zu bytes from (%p) to (%p)\n",
+	     data->aq->agent->device_id, data->aq->id, data->len, data->src,
+	     data->dst);
+  hsa_fns.hsa_memory_copy_fn (data->dst, data->src, data->len);
+  if (data->free_src)
+    free ((void *) data->src);
+  free (data);
+}
+
+/* Free device data.  This is intended for use as an async callback event.  */
+
+static void
+gomp_offload_free (void *ptr)
+{
+  GCN_DEBUG ("Async thread ?:?: Freeing %p\n", ptr);
+  GOMP_OFFLOAD_free (0, ptr);
+}
+
+/* Request an asynchronous data copy, to or from a device, on a given queue.
+   The event will be registered as a callback.  If FREE_SRC is true
+   then the source data will be freed following the copy.  */
+
+static void
+queue_push_copy (struct goacc_asyncqueue *aq, void *dst, const void *src,
+		 size_t len, bool free_src)
+{
+  if (DEBUG_QUEUES)
+    GCN_DEBUG ("queue_push_copy %d:%d: %zu bytes from (%p) to (%p)\n",
+	       aq->agent->device_id, aq->id, len, src, dst);
+  struct copy_data *data
+    = (struct copy_data *)GOMP_PLUGIN_malloc (sizeof (struct copy_data));
+  data->dst = dst;
+  data->src = src;
+  data->len = len;
+  data->free_src = free_src;
+  data->aq = aq;
+  queue_push_callback (aq, copy_data, data);
+}
+
+/* Return true if the given queue is currently empty.  */
+
+static int
+queue_empty (struct goacc_asyncqueue *aq)
+{
+  pthread_mutex_lock (&aq->mutex);
+  int res = aq->queue_n == 0 ? 1 : 0;
+  pthread_mutex_unlock (&aq->mutex);
+
+  return res;
+}
+
+/* Wait for a given queue to become empty.  This implements an OpenACC wait
+   directive.  */
+
+static void
+wait_queue (struct goacc_asyncqueue *aq)
+{
+  if (DRAIN_QUEUE_SYNCHRONOUS_P)
+    {
+      drain_queue_synchronous (aq);
+      return;
+    }
+
+  pthread_mutex_lock (&aq->mutex);
+
+  while (aq->queue_n > 0)
+    {
+      if (DEBUG_THREAD_SLEEP)
+	GCN_DEBUG ("waiting for thread %d:%d, putting thread to sleep\n",
+		   aq->agent->device_id, aq->id);
+      pthread_cond_wait (&aq->queue_cond_out, &aq->mutex);
+      if (DEBUG_THREAD_SLEEP)
+	GCN_DEBUG ("thread %d:%d woke up.  Rechecking\n", aq->agent->device_id,
+		   aq->id);
+    }
+
+  pthread_mutex_unlock (&aq->mutex);
+  GCN_DEBUG ("waiting for thread %d:%d, done\n", aq->agent->device_id, aq->id);
+}
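+
+/* Summary of the machinery above: each queue is serviced by a
+   drain_queue thread; producers call queue_push_launch,
+   queue_push_callback, queue_push_asyncwait or queue_push_placeholder,
+   blocking in wait_for_queue_nonfull when necessary and then signalling
+   queue_cond_in, while waiters call wait_queue (or poll queue_empty) and
+   sleep on queue_cond_out until the drain thread catches up.  */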
+
+/* }}}  */
+/* {{{ OpenACC support  */
+
+/* Execute an OpenACC kernel, synchronously or asynchronously.  */
+
+static void
+gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs,
+	  void **devaddrs, unsigned *dims, void *targ_mem_desc, bool async,
+	  struct goacc_asyncqueue *aq)
+{
+  if (!GOMP_OFFLOAD_can_run (kernel))
+    GOMP_PLUGIN_fatal ("OpenACC host fallback unimplemented.");
+
+  /* If we get here then this must be an OpenACC kernel.  */
+  kernel->kind = KIND_OPENACC;
+
+  /* devaddrs must be double-indirect on the target.  */
+  void **ind_da = alloc_by_agent (kernel->agent, sizeof (void*) * mapnum);
+  for (size_t i = 0; i < mapnum; i++)
+    hsa_fns.hsa_memory_copy_fn (&ind_da[i],
+				devaddrs[i] ? &devaddrs[i] : &hostaddrs[i],
+				sizeof (void *));
+
+  struct hsa_kernel_description *hsa_kernel_desc = NULL;
+  for (unsigned i = 0; i < kernel->module->image_desc->kernel_count; i++)
+    {
+      struct hsa_kernel_description *d
+	= &kernel->module->image_desc->kernel_infos[i];
+      /* Both name fields point into the image descriptor, so pointer
+	 comparison suffices.  */
+      if (d->name == kernel->name)
+	{
+	  hsa_kernel_desc = d;
+	  break;
+	}
+    }
+
+  /* We may have statically-determined dimensions in
+     hsa_kernel_desc->oacc_dims[] or dimensions passed to this offload kernel
+     invocation at runtime in dims[].  We allow static dimensions to take
+     priority over dynamic dimensions when present (non-zero).  */
+  if (hsa_kernel_desc->oacc_dims[0] > 0)
+    dims[0] = hsa_kernel_desc->oacc_dims[0];
+  if (hsa_kernel_desc->oacc_dims[1] > 0)
+    dims[1] = hsa_kernel_desc->oacc_dims[1];
+  if (hsa_kernel_desc->oacc_dims[2] > 0)
+    dims[2] = hsa_kernel_desc->oacc_dims[2];
+
+  /* If any of the OpenACC dimensions remain 0 then we get to pick a number.
+     There isn't really a correct answer for this without a clue about the
+     problem size, so let's do a reasonable number of single-worker gangs.
+     64 gangs matches a typical Fiji device.  */
+
+  /* NOTE: Until support for middle-end worker partitioning is merged, use 1
+     for the default number of workers.  */
+  if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
+  if (dims[1] == 0) dims[1] = 1;  /* Workers.  */
+
+  /* The incoming dimensions are expressed in terms of gangs, workers, and
+     vectors.  The HSA dimensions are expressed in terms of "work-items",
+     which means multiples of vector lanes.
+
+     The "grid size" specifies the size of the problem space, and the
+     "work-group size" specifies how much of that we want a single compute
+     unit to chew on at once.
+
+     The three dimensions do not really correspond to hardware, but the
+     important thing is that the HSA runtime will launch as many
+     work-groups as it takes to process the entire grid, and each
+     work-group will contain as many wave-fronts as it takes to process
+     the work-items in that group.
+
+     Essentially, as long as we set the Y dimension to 64 (the number of
+     vector lanes in hardware), and the Z group size to the maximum (16),
+     then we will get the gangs (X) and workers (Z) launched as we expect.
+
+     The reason for the apparent reversal of vector and worker dimension
+     order is to do with the way the run-time distributes work-items across
+     v1 and v2.  */
+  struct GOMP_kernel_launch_attributes kla =
+    {3,
+     /* Grid size.  */
+     {dims[0], 64, dims[1]},
+     /* Work-group size.  */
+     {1,       64, 16}
+    };
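+
+  /* For example (hypothetical request): num_gangs == 4 and
+     num_workers == 2 yield a grid of {4, 64, 2} with a requested
+     work-group size of {1, 64, 16}: X supplies the 4 gangs, Y fills all
+     64 vector lanes, and Z supplies the workers (run_kernel may clamp
+     the group size to the grid).  */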
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info enqueue_launch_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+	= prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+	= _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+	= acc_construct_parallel;
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name
+	= (char *) kernel->name;
+      enqueue_launch_event_info.launch_event.num_gangs = kla.gdims[0];
+      enqueue_launch_event_info.launch_event.num_workers = kla.gdims[2];
+      enqueue_launch_event_info.launch_event.vector_length = kla.gdims[1];
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+	&enqueue_launch_event_info, api_info);
+    }
+
+  if (!async)
+    {
+      run_kernel (kernel, ind_da, &kla, NULL, false);
+      gomp_offload_free (ind_da);
+    }
+  else
+    {
+      queue_push_launch (aq, kernel, ind_da, &kla);
+      if (DEBUG_QUEUES)
+	GCN_DEBUG ("queue_push_callback %d:%d gomp_offload_free, %p\n",
+		   aq->agent->device_id, aq->id, ind_da);
+      queue_push_callback (aq, gomp_offload_free, ind_da);
+    }
+
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info,
+					    &enqueue_launch_event_info,
+					    api_info);
+    }
+}
+
+/* }}}  */
+/* {{{ Generic Plugin API  */
+
+/* Return the name of the accelerator, which is "gcn".  */
+
+const char *
+GOMP_OFFLOAD_get_name (void)
+{
+  return "gcn";
+}
+
+/* Return the specific capabilities the HSA accelerator has.  */
+
+unsigned int
+GOMP_OFFLOAD_get_caps (void)
+{
+  /* FIXME: Enable shared memory for APU, but not discrete GPU.  */
+  return /*GOMP_OFFLOAD_CAP_SHARED_MEM |*/ GOMP_OFFLOAD_CAP_OPENMP_400
+	    | GOMP_OFFLOAD_CAP_OPENACC_200;
+}
+
+/* Identify as GCN accelerator.  */
+
+int
+GOMP_OFFLOAD_get_type (void)
+{
+  return OFFLOAD_TARGET_TYPE_GCN;
+}
+
+/* Return the libgomp version number we're compatible with.  There is
+   no requirement for cross-version compatibility.  */
+
+unsigned
+GOMP_OFFLOAD_version (void)
+{
+  return GOMP_VERSION;
+}
+
+/* Return the number of GCN devices on the system.  */
+
+int
+GOMP_OFFLOAD_get_num_devices (void)
+{
+  if (!init_hsa_context ())
+    return 0;
+  return hsa_context.agent_count;
+}
+
+/* Initialize device (agent) number N so that it can be used for computation.
+   Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_init_device (int n)
+{
+  if (!init_hsa_context ())
+    return false;
+  if (n >= hsa_context.agent_count)
+    {
+      GOMP_PLUGIN_error ("Request to initialize non-existent GCN device %i", n);
+      return false;
+    }
+  struct agent_info *agent = &hsa_context.agents[n];
+
+  if (agent->initialized)
+    return true;
+
+  agent->device_id = n;
+
+  if (pthread_rwlock_init (&agent->module_rwlock, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent rwlock");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->prog_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->async_queues_mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      return false;
+    }
+  if (pthread_mutex_init (&agent->team_arena_write_lock, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex");
+      return false;
+    }
+  agent->async_queues = NULL;
+  agent->omp_async_queue = NULL;
+  agent->team_arena_list = NULL;
+
+  uint32_t queue_size;
+  hsa_status_t status;
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id,
+					  HSA_AGENT_INFO_QUEUE_MAX_SIZE,
+					  &queue_size);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error requesting maximum queue size of the GCN agent",
+		      status);
+
+  char buf[64];
+  status = hsa_fns.hsa_agent_get_info_fn (agent->id, HSA_AGENT_INFO_NAME,
+					  &buf);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error querying the name of the agent", status);
+  agent->gfx900_p = (strncmp (buf, "gfx900", 6) == 0);
+
+  status = hsa_fns.hsa_queue_create_fn (agent->id, queue_size,
+					HSA_QUEUE_TYPE_MULTI,
+					hsa_queue_callback, NULL, UINT32_MAX,
+					UINT32_MAX, &agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error creating command queue", status);
+
+  agent->kernarg_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+						 get_kernarg_memory_region,
+						 &agent->kernarg_region);
+  if (agent->kernarg_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for kernel "
+			 "arguments");
+      return false;
+    }
+  GCN_DEBUG ("Selected kernel arguments memory region:\n");
+  dump_hsa_region (agent->kernarg_region, NULL);
+
+  agent->data_region.handle = (uint64_t) -1;
+  status = hsa_fns.hsa_agent_iterate_regions_fn (agent->id,
+						 get_data_memory_region,
+						 &agent->data_region);
+  if (agent->data_region.handle == (uint64_t) -1)
+    {
+      GOMP_PLUGIN_error ("Could not find suitable memory region for device "
+			 "data");
+      return false;
+    }
+  GCN_DEBUG ("Selected device data memory region:\n");
+  dump_hsa_region (agent->data_region, NULL);
+
+  GCN_DEBUG ("GCN agent %d initialized\n", n);
+
+  agent->initialized = true;
+  return true;
+}
+
+/* Load GCN object-code module described by struct gcn_image_desc in
+   TARGET_DATA and return references to kernel descriptors in TARGET_TABLE.
+   If there are any constructors then run them.  */
+
+int
+GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
+			 struct addr_pair **target_table)
+{
+  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return -1;
+    }
+
+  struct gcn_image_desc *image_desc = (struct gcn_image_desc *) target_data;
+  struct agent_info *agent;
+  struct addr_pair *pair;
+  struct module_info *module;
+  struct kernel_info *kernel;
+  int kernel_count = image_desc->kernel_count;
+  unsigned var_count = image_desc->global_variable_count;
+
+  agent = get_agent_info (ord);
+  if (!agent)
+    return -1;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return -1;
+    }
+  if (agent->prog_finalized
+      && !destroy_hsa_program (agent))
+    return -1;
+
+  GCN_DEBUG ("Encountered %d kernels in an image\n", kernel_count);
+  GCN_DEBUG ("Encountered %u global variables in an image\n", var_count);
+  /* The two kernels subtracted here are expected to be the _init_array
+     and _fini_array pseudo-kernels, which are not reported to libgomp.  */
+  pair = GOMP_PLUGIN_malloc ((kernel_count + var_count - 2)
+			     * sizeof (struct addr_pair));
+  *target_table = pair;
+  module = (struct module_info *)
+    GOMP_PLUGIN_malloc_cleared (sizeof (struct module_info)
+				+ kernel_count * sizeof (struct kernel_info));
+  module->image_desc = image_desc;
+  module->kernel_count = kernel_count;
+  module->heap = NULL;
+  module->constructors_run_p = false;
+
+  kernel = &module->kernels[0];
+
+  /* Allocate memory for kernel dependencies.  */
+  for (unsigned i = 0; i < kernel_count; i++)
+    {
+      struct hsa_kernel_description *d = &image_desc->kernel_infos[i];
+      if (!init_basic_kernel_info (kernel, d, agent, module))
+	return -1;
+      if (strcmp (d->name, "_init_array") == 0)
+	module->init_array_func = kernel;
+      else if (strcmp (d->name, "_fini_array") == 0)
+	module->fini_array_func = kernel;
+      else
+	{
+	  pair->start = (uintptr_t) kernel;
+	  pair->end = (uintptr_t) (kernel + 1);
+	  pair++;
+	}
+      kernel++;
+    }
+
+  agent->module = module;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return -1;
+    }
+
+  if (!create_and_finalize_hsa_program (agent))
+    return -1;
+
+  for (unsigned i = 0; i < var_count; i++)
+    {
+      struct global_var_info *v = &image_desc->global_variables[i];
+      GCN_DEBUG ("Looking for variable %s\n", v->name);
+
+      hsa_status_t status;
+      hsa_executable_symbol_t var_symbol;
+      status = hsa_fns.hsa_executable_get_symbol_fn (agent->executable, NULL,
+						     v->name, agent->id,
+						     0, &var_symbol);
+
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not find symbol for variable in the code object",
+		   status);
+
+      uint64_t var_addr;
+      uint32_t var_size;
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &var_addr);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable from its symbol", status);
+      status = hsa_fns.hsa_executable_symbol_get_info_fn
+	(var_symbol, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &var_size);
+      if (status != HSA_STATUS_SUCCESS)
+	hsa_fatal ("Could not extract a variable size from its symbol", status);
+
+      pair->start = var_addr;
+      pair->end = var_addr + var_size;
+      GCN_DEBUG ("Found variable %s at %p with size %u\n", v->name,
+		 (void *)var_addr, var_size);
+      pair++;
+    }
+
+  /* Ensure that constructors are run first.  */
+  struct GOMP_kernel_launch_attributes kla =
+    { 3,
+      /* Grid size.  */
+      { 1, 64, 1 },
+      /* Work-group size.  */
+      { 1, 64, 1 }
+    };
+
+  if (module->init_array_func)
+    {
+      init_kernel (module->init_array_func);
+      run_kernel (module->init_array_func, NULL, &kla, NULL, false);
+    }
+  module->constructors_run_p = true;
+
+  /* Don't report kernels that libgomp need not know about.  */
+  if (module->init_array_func)
+    kernel_count--;
+  if (module->fini_array_func)
+    kernel_count--;
+
+  return kernel_count + var_count;
+}
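+
+/* Accounting example for the return value above (hypothetical image): an
+   image containing 5 kernels, including _init_array and _fini_array,
+   plus 2 global variables produces 3 kernel addr_pairs and 2 variable
+   addr_pairs, and the function returns 3 + 2 == 5.  */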
+
+/* Unload GCN object-code module described by struct gcn_image_desc in
+   TARGET_DATA from agent number N.  Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_unload_image (int n, unsigned version, const void *target_data)
+{
+  if (GOMP_VERSION_DEV (version) != GOMP_VERSION_GCN)
+    {
+      GOMP_PLUGIN_error ("Offload data incompatible with GCN plugin"
+			 " (expected %u, received %u)",
+			 GOMP_VERSION_GCN, GOMP_VERSION_DEV (version));
+      return false;
+    }
+
+  struct agent_info *agent;
+  agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (pthread_rwlock_wrlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to write-lock a GCN agent rwlock");
+      return false;
+    }
+
+  if (!agent->module || agent->module->image_desc != target_data)
+    {
+      GOMP_PLUGIN_error ("Attempt to unload an image that has never been "
+			 "loaded before");
+      return false;
+    }
+
+  if (!destroy_module (agent->module, true))
+    return false;
+  free (agent->module);
+  agent->module = NULL;
+  if (!destroy_hsa_program (agent))
+    return false;
+  if (pthread_rwlock_unlock (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Unable to unlock a GCN agent rwlock");
+      return false;
+    }
+  return true;
+}
+
+/* Deinitialize all information and status associated with agent number N.  We
+   do not attempt any synchronization, assuming the user and libgomp will not
+   attempt deinitialization of a device that is in any way being used at the
+   same time.  Return TRUE on success.  */
+
+bool
+GOMP_OFFLOAD_fini_device (int n)
+{
+  struct agent_info *agent = get_agent_info (n);
+  if (!agent)
+    return false;
+
+  if (!agent->initialized)
+    return true;
+
+  if (agent->omp_async_queue)
+    {
+      GOMP_OFFLOAD_openacc_async_destruct (agent->omp_async_queue);
+      agent->omp_async_queue = NULL;
+    }
+
+  if (agent->module)
+    {
+      if (!destroy_module (agent->module, false))
+	return false;
+      free (agent->module);
+      agent->module = NULL;
+    }
+
+  if (!destroy_team_arenas (agent))
+    return false;
+
+  if (!destroy_hsa_program (agent))
+    return false;
+
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (agent->sync_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Error destroying command queue", status);
+
+  if (pthread_mutex_destroy (&agent->prog_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent program mutex");
+      return false;
+    }
+  if (pthread_rwlock_destroy (&agent->module_rwlock))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent rwlock");
+      return false;
+    }
+
+  if (pthread_mutex_destroy (&agent->async_queues_mutex))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN agent queue mutex");
+      return false;
+    }
+  if (pthread_mutex_destroy (&agent->team_arena_write_lock))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex");
+      return false;
+    }
+  agent->initialized = false;
+  return true;
+}
+
+/* Return true if the HSA runtime can run function FN_PTR.  */
+
+bool
+GOMP_OFFLOAD_can_run (void *fn_ptr)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  init_kernel (kernel);
+  if (kernel->initialization_failed)
+    goto failure;
+
+  return true;
+
+failure:
+  if (suppress_host_fallback)
+    GOMP_PLUGIN_fatal ("GCN host fallback has been suppressed");
+  GCN_WARNING ("GCN target cannot be launched, doing a host fallback\n");
+  return false;
+}
+
+/* Allocate memory on device N.  */
+
+void *
+GOMP_OFFLOAD_alloc (int n, size_t size)
+{
+  struct agent_info *agent = get_agent_info (n);
+  return alloc_by_agent (agent, size);
+}
+
+/* Free memory from DEVICE.  */
+
+bool
+GOMP_OFFLOAD_free (int device, void *ptr)
+{
+  GCN_DEBUG ("Freeing memory on device %d\n", device);
+
+  hsa_status_t status = hsa_fns.hsa_memory_free_fn (ptr);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Could not free device memory", status);
+      return false;
+    }
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_free;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+	= _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+	= acc_construct_parallel;
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL;
+      data_event_info.data_event.bytes = 0;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) ptr;
+
+      api_info->device_api = acc_device_api_other;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+					    api_info);
+    }
+
+  return true;
+}
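+
+/* The profiling dispatch above follows the OpenACC Profiling Interface: it
+   emits an acc_ev_free event recording only the device pointer; "bytes" is
+   zero because the plugin does not track the size of the freed block.  */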
+
+/* Copy data from DEVICE to host.  */
+
+bool
+GOMP_OFFLOAD_dev2host (int device, void *dst, const void *src, size_t n)
+{
+  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to host (%p)\n", n, device,
+	     src, dst);
+  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not copy data from device to host", status);
+  return true;
+}
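+
+/* Note: hsa_memory_copy blocks until the transfer is complete, so the
+   synchronous copy hooks in this section need no further
+   synchronization.  */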
+
+/* Copy data from host to DEVICE.  */
+
+bool
+GOMP_OFFLOAD_host2dev (int device, void *dst, const void *src, size_t n)
+{
+  GCN_DEBUG ("Copying %zu bytes from host (%p) to device %d (%p)\n", n, src,
+	     device, dst);
+  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not copy data from host to device", status);
+  return true;
+}
+
+/* Copy data within DEVICE.  Do the copy asynchronously, if appropriate.  */
+
+bool
+GOMP_OFFLOAD_dev2dev (int device, void *dst, const void *src, size_t n)
+{
+  struct gcn_thread *thread_data = gcn_thread ();
+
+  if (thread_data && !async_synchronous_p (thread_data->async))
+    {
+      struct agent_info *agent = get_agent_info (device);
+      maybe_init_omp_async (agent);
+      queue_push_copy (agent->omp_async_queue, dst, src, n, false);
+      return true;
+    }
+
+  GCN_DEBUG ("Copying %zu bytes from device %d (%p) to device %d (%p)\n", n,
+	     device, src, device, dst);
+  hsa_status_t status = hsa_fns.hsa_memory_copy_fn (dst, src, n);
+  if (status != HSA_STATUS_SUCCESS)
+    return hsa_error ("Could not copy data within device", status);
+  return true;
+}
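+
+/* The asynchronous path above is taken only when GCN thread data exists
+   with a non-synchronous async value; that data is created by the OpenACC
+   entry points, so OpenMP device-to-device copies presumably always take
+   the synchronous path.  */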
+
+/* }}}  */
+/* {{{ OpenMP Plugin API  */
+
+/* Run a synchronous OpenMP kernel on DEVICE and pass it an array of pointers
+   in VARS as a parameter.  The kernel is identified by FN_PTR, which must
+   point to a kernel_info structure that has previously been loaded onto the
+   specified device.  */
+
+void
+GOMP_OFFLOAD_run (int device, void *fn_ptr, void *vars, void **args)
+{
+  struct agent_info *agent = get_agent_info (device);
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+  assert (agent == kernel->agent);
+
+  /* If we get here then the kernel must be OpenMP.  */
+  kernel->kind = KIND_OPENMP;
+
+  if (!parse_target_attributes (args, &def, &kla, agent))
+    {
+      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
+      return;
+    }
+  run_kernel (kernel, vars, kla, NULL, false);
+}
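+
+/* The DEF/KLA pair above lets parse_target_attributes return either launch
+   attributes decoded from ARGS or defaults filled into DEF; a reported grid
+   size of zero is the one case that aborts the launch.  */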
+
+/* Run an asynchronous OpenMP kernel on DEVICE.  This is similar to
+   GOMP_OFFLOAD_run except that the launch is queued and there is a call to
+   GOMP_PLUGIN_target_task_completion when it has finished.  */
+
+void
+GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars,
+			void **args, void *async_data)
+{
+  GCN_DEBUG ("GOMP_OFFLOAD_async_run invoked\n");
+  struct agent_info *agent = get_agent_info (device);
+  struct kernel_info *kernel = (struct kernel_info *) tgt_fn;
+  struct GOMP_kernel_launch_attributes def;
+  struct GOMP_kernel_launch_attributes *kla;
+  assert (agent == kernel->agent);
+
+  /* If we get here then the kernel must be OpenMP.  */
+  kernel->kind = KIND_OPENMP;
+
+  if (!parse_target_attributes (args, &def, &kla, agent))
+    {
+      GCN_WARNING ("Will not run GCN kernel because the grid size is zero\n");
+      return;
+    }
+
+  maybe_init_omp_async (agent);
+  queue_push_launch (agent->omp_async_queue, kernel, tgt_vars, kla);
+  queue_push_callback (agent->omp_async_queue,
+		       GOMP_PLUGIN_target_task_completion, async_data);
+}
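+
+/* Sketch of the asynchronous completion chain (illustrative; the precise
+   sequencing is owned by the queue thread):
+
+     queue: [LAUNCH kernel] -> [CALLBACK GOMP_PLUGIN_target_task_completion]
+
+   Entries on an async queue execute strictly in order, so the completion
+   callback cannot fire before the kernel launch entry has finished.  */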
+
+/* }}} */
+/* {{{ OpenACC Plugin API  */
+
+/* Run a synchronous OpenACC kernel.  The device number is inferred from the
+   already-loaded KERNEL.  */
+
+void
+GOMP_OFFLOAD_openacc_exec (void (*fn_ptr) (void *), size_t mapnum,
+			   void **hostaddrs, void **devaddrs, unsigned *dims,
+			   void *targ_mem_desc)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, false,
+	    NULL);
+}
+
+/* Run an asynchronous OpenACC kernel on the specified queue.  */
+
+void
+GOMP_OFFLOAD_openacc_async_exec (void (*fn_ptr) (void *), size_t mapnum,
+				 void **hostaddrs, void **devaddrs,
+				 unsigned *dims, void *targ_mem_desc,
+				 struct goacc_asyncqueue *aq)
+{
+  struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
+
+  gcn_exec (kernel, mapnum, hostaddrs, devaddrs, dims, targ_mem_desc, true,
+	    aq);
+}
+
+/* Create a new asynchronous thread and queue for running future kernels.  */
+
+struct goacc_asyncqueue *
+GOMP_OFFLOAD_openacc_async_construct (int device)
+{
+  struct agent_info *agent = get_agent_info (device);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  struct goacc_asyncqueue *aq = GOMP_PLUGIN_malloc (sizeof (*aq));
+  aq->agent = agent;
+  aq->prev = NULL;
+  aq->next = agent->async_queues;
+  if (aq->next)
+    {
+      aq->next->prev = aq;
+      aq->id = aq->next->id + 1;
+    }
+  else
+    aq->id = 1;
+  agent->async_queues = aq;
+
+  aq->queue_first = 0;
+  aq->queue_n = 0;
+  aq->drain_queue_stop = 0;
+
+  if (pthread_mutex_init (&aq->mutex, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex");
+      pthread_mutex_unlock (&agent->async_queues_mutex);
+      return NULL;
+    }
+  if (pthread_cond_init (&aq->queue_cond_in, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue-in cond");
+      pthread_mutex_unlock (&agent->async_queues_mutex);
+      return NULL;
+    }
+  if (pthread_cond_init (&aq->queue_cond_out, NULL))
+    {
+      GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue-out cond");
+      pthread_mutex_unlock (&agent->async_queues_mutex);
+      return NULL;
+    }
+
+  hsa_status_t status = hsa_fns.hsa_queue_create_fn (agent->id,
+						     ASYNC_QUEUE_SIZE,
+						     HSA_QUEUE_TYPE_MULTI,
+						     hsa_queue_callback, NULL,
+						     UINT32_MAX, UINT32_MAX,
+						     &aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    hsa_fatal ("Error creating command queue", status);
+
+  int err = pthread_create (&aq->thread_drain_queue, NULL, &drain_queue, aq);
+  if (err != 0)
+    GOMP_PLUGIN_fatal ("GCN asynchronous thread creation failed: %s",
+		       strerror (err));
+  GCN_DEBUG ("Async thread %d:%d: created\n", aq->agent->device_id,
+	     aq->id);
+
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+
+  return aq;
+}
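+
+/* Queues belonging to an agent are kept on a doubly-linked list headed at
+   agent->async_queues; async_queues_mutex guards that list so queue
+   construction and destruction on different host threads serialize
+   safely.  */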
+
+/* Destroy an existing asynchronous thread and queue.  Waits for any
+   currently-running task to complete, but cancels any queued tasks.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_destruct (struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = aq->agent;
+
+  finalize_async_thread (aq);
+
+  pthread_mutex_lock (&agent->async_queues_mutex);
+
+  int err;
+  if ((err = pthread_mutex_destroy (&aq->mutex)))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue mutex: %d", err);
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_in))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  if (pthread_cond_destroy (&aq->queue_cond_out))
+    {
+      GOMP_PLUGIN_error ("Failed to destroy a GCN async queue cond");
+      goto fail;
+    }
+  hsa_status_t status = hsa_fns.hsa_queue_destroy_fn (aq->hsa_queue);
+  if (status != HSA_STATUS_SUCCESS)
+    {
+      hsa_error ("Error destroying command queue", status);
+      goto fail;
+    }
+
+  if (aq->prev)
+    aq->prev->next = aq->next;
+  if (aq->next)
+    aq->next->prev = aq->prev;
+  if (agent->async_queues == aq)
+    agent->async_queues = aq->next;
+
+  GCN_DEBUG ("Async thread %d:%d: destroyed\n", agent->device_id, aq->id);
+
+  free (aq);
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return true;
+
+fail:
+  pthread_mutex_unlock (&agent->async_queues_mutex);
+  return false;
+}
+
+/* Return true if the specified async queue is currently empty.  */
+
+int
+GOMP_OFFLOAD_openacc_async_test (struct goacc_asyncqueue *aq)
+{
+  return queue_empty (aq);
+}
+
+/* Block until the specified queue has executed all its tasks and the
+   queue is empty.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_synchronize (struct goacc_asyncqueue *aq)
+{
+  wait_queue (aq);
+  return true;
+}
+
+/* Add a serialization point across two async queues.  Any new tasks added to
+   AQ2 after this call will not run until all tasks that were on AQ1 at the
+   time of this call have completed.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_serialize (struct goacc_asyncqueue *aq1,
+				      struct goacc_asyncqueue *aq2)
+{
+  /* For serialize, stream aq2 waits for aq1 to complete work that has been
+     scheduled to run on it up to this point.  */
+  if (aq1 != aq2)
+    {
+      struct placeholder *placeholderp = queue_push_placeholder (aq1);
+      queue_push_asyncwait (aq2, placeholderp);
+    }
+  return true;
+}
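+
+/* Illustrative timeline, relying on queue entries executing strictly in
+   order:
+
+     AQ1:  [K1] [K2] [PLACEHOLDER]        <- signalled once reached
+     AQ2:  [ASYNCWAIT placeholder] [K3]   <- K3 waits for the signal
+
+   K3 therefore cannot start until K1 and K2 have completed.  */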
+
+/* Add an opaque callback to the given async queue.  */
+
+void
+GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq,
+					   void (*fn) (void *), void *data)
+{
+  queue_push_callback (aq, fn, data);
+}
+
+/* Queue up an asynchronous data copy from host to DEVICE.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+  /* The source data does not necessarily remain live until the deferred
+     copy happens.  Taking a snapshot of the data here avoids reading
+     uninitialised data later, but means that (a) the data is copied twice
+     and (b) modifications made to the source between the "spawning" point
+     of the asynchronous kernel and its execution will not be seen.  That,
+     however, is the expected semantics of an asynchronous copy.  */
+  void *src_copy = GOMP_PLUGIN_malloc (n);
+  memcpy (src_copy, src, n);
+  queue_push_copy (aq, dst, src_copy, n, true);
+  return true;
+}
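+
+/* The final "true" argument to queue_push_copy above presumably tells the
+   queue to free SRC_COPY once the copy has been performed, in contrast to
+   the "false" used in the dev2host variant below, where the buffers are
+   owned by the caller.  */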
+
+/* Queue up an asynchronous data copy from DEVICE to host.  */
+
+bool
+GOMP_OFFLOAD_openacc_async_dev2host (int device, void *dst, const void *src,
+				     size_t n, struct goacc_asyncqueue *aq)
+{
+  struct agent_info *agent = get_agent_info (device);
+  assert (agent == aq->agent);
+  queue_push_copy (aq, dst, src, n, false);
+  return true;
+}
+
+/* Set up plugin-specific thread-local-data (host-side).  */
+
+void *
+GOMP_OFFLOAD_openacc_create_thread_data (int ord __attribute__((unused)))
+{
+  struct gcn_thread *thread_data
+    = GOMP_PLUGIN_malloc (sizeof (struct gcn_thread));
+
+  thread_data->async = GOMP_ASYNC_SYNC;
+
+  return (void *) thread_data;
+}
+
+/* Clean up plugin-specific thread-local-data.  */
+
+void
+GOMP_OFFLOAD_openacc_destroy_thread_data (void *data)
+{
+  free (data);
+}
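+
+/* Minimal usage sketch of the queue lifecycle (hypothetical direct caller;
+   in reality libgomp drives these hooks through its own goacc_asyncqueue
+   bookkeeping, and my_callback/my_data below are placeholders):
+
+     struct goacc_asyncqueue *aq
+       = GOMP_OFFLOAD_openacc_async_construct (0);
+     GOMP_OFFLOAD_openacc_async_queue_callback (aq, my_callback, my_data);
+     GOMP_OFFLOAD_openacc_async_synchronize (aq);
+     GOMP_OFFLOAD_openacc_async_destruct (aq);  */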
+
+/* }}} */
