The AMD GCN runtime support appears to exercise asynchronous operations more heavily than other offload targets. As such, latent problems with asynchronous host-to-device copies have come to light with GCN. This patch provides a solution to those.
Previously posted for the og9 branch (with some rationale) here: https://gcc.gnu.org/ml/gcc-patches/2019-09/msg01026.html This patch implies an ABI change for GOMP_OFFLOAD_openacc_async_host2dev. I'm not sure what our policy is there: do we need to introduce a new "v2" plugin entry point? OK? Julian ChangeLog libgomp/ * libgomp-plugin.h (GOMP_OFFLOAD_openacc_async_host2dev): Update prototype. * libgomp.h (gomp_copy_host2dev): Update prototype. * oacc-host.c (host_openacc_async_host2dev): Add ephemeral parameter. * oacc-mem.c (memcpy_tofrom_device): Update call to gomp_copy_host2dev. (update_dev_host): Likewise. * plugin/plugin-gcn.c (GOMP_OFFLOAD_openacc_async_host2dev): Handle ephemeral host-to-device copies. * plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_host2dev): Add EPHEMERAL parameter, and FIXME function comment. * target.c (goacc_device_copy_async): Remove. (gomp_copy_host2dev): Add ephemeral parameter. Update function comment. Call async host2dev plugin hook directly. (gomp_copy_dev2host): Call async dev2host plugin hook directly. (gomp_map_vars_existing, gomp_map_pointer): Update calls to gomp_copy_host2dev. (gomp_map_vars_internal): Don't use coalescing buffer for asynchronous copies. Update calls to gomp_copy_host2dev. (gomp_update): Update calls to gomp_copy_host2dev. --- libgomp/libgomp-plugin.h | 3 +- libgomp/libgomp.h | 2 +- libgomp/oacc-host.c | 1 + libgomp/oacc-mem.c | 4 +- libgomp/plugin/plugin-gcn.c | 23 +++++---- libgomp/plugin/plugin-nvptx.c | 13 ++++- libgomp/target.c | 92 +++++++++++++++++++---------------- 7 files changed, 82 insertions(+), 56 deletions(-) diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 037558c43f5..200f3b594ee 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -126,7 +126,8 @@ extern void GOMP_OFFLOAD_openacc_async_exec (void (*) (void *), size_t, void **, struct goacc_asyncqueue *); extern bool GOMP_OFFLOAD_openacc_async_dev2host (int, void *, const void *, size_t, struct goacc_asyncqueue *); -extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, size_t, +extern bool GOMP_OFFLOAD_openacc_async_host2dev (int, void *, const void *, + size_t, bool, struct goacc_asyncqueue *); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_device (void); extern void *GOMP_OFFLOAD_openacc_cuda_get_current_context (void); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 7b46e0a494d..65fa390f4a5 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1153,7 +1153,7 @@ extern void gomp_acc_declare_allocate (bool, size_t, void **, size_t *, struct gomp_coalesce_buf; extern void gomp_copy_host2dev (struct gomp_device_descr *, struct goacc_asyncqueue *, void *, const void *, - size_t, struct gomp_coalesce_buf *); + size_t, bool, struct gomp_coalesce_buf *); extern void gomp_copy_dev2host (struct gomp_device_descr *, struct goacc_asyncqueue *, void *, const void *, size_t); diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c index cbcac9bf7b3..bc11770725d 100644 --- a/libgomp/oacc-host.c +++ b/libgomp/oacc-host.c @@ -184,6 +184,7 @@ host_openacc_async_host2dev (int ord __attribute__ ((unused)), void *dst __attribute__ ((unused)), const void *src __attribute__ ((unused)), size_t n __attribute__ ((unused)), + bool eph __attribute__ ((unused)), struct goacc_asyncqueue *aq __attribute__ ((unused))) { diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c index 2f271009fb8..240ebc5c865 100644 --- a/libgomp/oacc-mem.c +++ b/libgomp/oacc-mem.c @@ -205,7 +205,7 @@ memcpy_tofrom_device (bool from, void *d, void *h, size_t s, int async, if (from) gomp_copy_dev2host (thr->dev, aq, h, d, s); else - gomp_copy_host2dev (thr->dev, aq, d, h, s, /* TODO: cbuf? */ NULL); + gomp_copy_host2dev (thr->dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL); if (profiling_p) { @@ -856,7 +856,7 @@ update_dev_host (int is_dev, void *h, size_t s, int async) goacc_aq aq = get_goacc_asyncqueue (async); if (is_dev) - gomp_copy_host2dev (acc_dev, aq, d, h, s, /* TODO: cbuf? */ NULL); + gomp_copy_host2dev (acc_dev, aq, d, h, s, false, /* TODO: cbuf? */ NULL); else gomp_copy_dev2host (acc_dev, aq, h, d, s); diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 583916759a5..7b95a4cef8f 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -3933,19 +3933,22 @@ GOMP_OFFLOAD_openacc_async_queue_callback (struct goacc_asyncqueue *aq, bool GOMP_OFFLOAD_openacc_async_host2dev (int device, void *dst, const void *src, - size_t n, struct goacc_asyncqueue *aq) + size_t n, bool ephemeral, + struct goacc_asyncqueue *aq) { struct agent_info *agent = get_agent_info (device); assert (agent == aq->agent); - /* The source data does not necessarily remain live until the deferred - copy happens. Taking a snapshot of the data here avoids reading - uninitialised data later, but means that (a) data is copied twice and - (b) modifications to the copied data between the "spawning" point of - the asynchronous kernel and when it is executed will not be seen. - But, that is probably correct. */ - void *src_copy = GOMP_PLUGIN_malloc (n); - memcpy (src_copy, src, n); - queue_push_copy (aq, dst, src_copy, n, true); + + if (ephemeral) + { + /* The source data is on the stack or otherwise may be deallocated + before the asynchronous copy takes place. Take a copy of the source + data. */ + void *src_copy = GOMP_PLUGIN_malloc (n); + memcpy (src_copy, src, n); + src = src_copy; + } + queue_push_copy (aq, dst, src, n, ephemeral); return true; } diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 911d0f66a6e..34b25f13dd4 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1711,9 +1711,20 @@ GOMP_OFFLOAD_dev2dev (int ord, void *dst, const void *src, size_t n) return true; } +/* FIXME: It is unknown whether the cuMemcpyHtoDAsync API call caches source + data before the asynchronous copy takes place. Either way there is a data + race associated with ignoring the EPHEMERAL parameter here -- either if it + is TRUE (because we are copying uncached data that may disappear before the + async copy takes place) or if it is FALSE (because the source data may be + cached/snapshotted here before it is modified by an earlier async operation, + so stale data gets copied to the target). + Neither problem has been observed in practice, so far. */ + bool GOMP_OFFLOAD_openacc_async_host2dev (int ord, void *dst, const void *src, - size_t n, struct goacc_asyncqueue *aq) + size_t n, + bool ephemeral __attribute__((unused)), + struct goacc_asyncqueue *aq) { if (!nvptx_attach_host_thread_to_device (ord) || !cuda_memcpy_sanity_check (src, dst, n)) diff --git a/libgomp/target.c b/libgomp/target.c index b6276cade9f..792c70ce331 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -190,22 +190,6 @@ gomp_device_copy (struct gomp_device_descr *devicep, } } -static inline void -goacc_device_copy_async (struct gomp_device_descr *devicep, - bool (*copy_func) (int, void *, const void *, size_t, - struct goacc_asyncqueue *), - const char *dst, void *dstaddr, - const char *src, const void *srcaddr, - size_t size, struct goacc_asyncqueue *aq) -{ - if (!copy_func (devicep->target_id, dstaddr, srcaddr, size, aq)) - { - gomp_mutex_unlock (&devicep->lock); - gomp_fatal ("Copying of %s object [%p..%p) to %s object [%p..%p) failed", - src, srcaddr, srcaddr + size, dst, dstaddr, dstaddr + size); - } -} - /* Infrastructure for coalescing adjacent or nearly adjacent (in device addresses) host to device memory transfers. */ @@ -298,11 +282,18 @@ gomp_to_device_kind_p (int kind) } } +/* Copy host memory to an offload device. In asynchronous mode (if AQ is + non-NULL), when the source data is stack or may otherwise be deallocated + before the asynchronous copy takes place, EPHEMERAL must be passed as + TRUE. The CBUF isn't used for non-ephemeral asynchronous copies, because + the host data might not be computed yet (by an earlier asynchronous compute + region). */ + attribute_hidden void gomp_copy_host2dev (struct gomp_device_descr *devicep, struct goacc_asyncqueue *aq, void *d, const void *h, size_t sz, - struct gomp_coalesce_buf *cbuf) + bool ephemeral, struct gomp_coalesce_buf *cbuf) { if (cbuf) { @@ -330,8 +321,15 @@ gomp_copy_host2dev (struct gomp_device_descr *devicep, } } if (__builtin_expect (aq != NULL, 0)) - goacc_device_copy_async (devicep, devicep->openacc.async.host2dev_func, - "dev", d, "host", h, sz, aq); + { + if (!devicep->openacc.async.host2dev_func (devicep->target_id, d, h, sz, + ephemeral, aq)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("Copying of host object [%p..%p) to dev object [%p..%p) " + "failed", h, h + sz, d, d + sz); + } + } else gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, sz); } @@ -342,8 +340,15 @@ gomp_copy_dev2host (struct gomp_device_descr *devicep, void *h, const void *d, size_t sz) { if (__builtin_expect (aq != NULL, 0)) - goacc_device_copy_async (devicep, devicep->openacc.async.dev2host_func, - "host", h, "dev", d, sz, aq); + { + if (!devicep->openacc.async.dev2host_func (devicep->target_id, h, d, sz, + aq)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("Copying of dev object [%p..%p) to host object [%p..%p) " + "failed", d, d + sz, h, h + sz); + } + } else gomp_device_copy (devicep, devicep->dev2host_func, "host", h, "dev", d, sz); } @@ -390,7 +395,7 @@ gomp_map_vars_existing (struct gomp_device_descr *devicep, (void *) (oldn->tgt->tgt_start + oldn->tgt_offset + newn->host_start - oldn->host_start), (void *) newn->host_start, - newn->host_end - newn->host_start, cbuf); + newn->host_end - newn->host_start, false, cbuf); if (oldn->refcount != REFCOUNT_INFINITY) oldn->refcount++; @@ -418,8 +423,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq, cur_node.tgt_offset = (uintptr_t) NULL; gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset), - (void *) &cur_node.tgt_offset, - sizeof (void *), cbuf); + (void *) &cur_node.tgt_offset, sizeof (void *), + true, cbuf); return; } /* Add bias to the pointer value. */ @@ -439,7 +444,8 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq, to initialize the pointer with. */ cur_node.tgt_offset -= bias; gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + target_offset), - (void *) &cur_node.tgt_offset, sizeof (void *), cbuf); + (void *) &cur_node.tgt_offset, sizeof (void *), true, + cbuf); } static void @@ -656,8 +662,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, for (i = first; i <= last; i++) { tgt->list[i].key = NULL; - if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i) - & typemask)) + if (!aq + && gomp_to_device_kind_p (get_kind (short_mapkind, kinds, + i) & typemask)) gomp_coalesce_buf_add (&cbuf, tgt_size - cur_node.host_end + (uintptr_t) hostaddrs[i], @@ -692,8 +699,9 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, if (tgt_align < align) tgt_align = align; tgt_size = (tgt_size + align - 1) & ~(align - 1); - gomp_coalesce_buf_add (&cbuf, tgt_size, - cur_node.host_end - cur_node.host_start); + if (!aq) + gomp_coalesce_buf_add (&cbuf, tgt_size, + cur_node.host_end - cur_node.host_start); tgt_size += cur_node.host_end - cur_node.host_start; has_firstprivate = true; continue; @@ -723,7 +731,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, if (tgt_align < align) tgt_align = align; tgt_size = (tgt_size + align - 1) & ~(align - 1); - if (gomp_to_device_kind_p (kind & typemask)) + if (!aq && gomp_to_device_kind_p (kind & typemask)) gomp_coalesce_buf_add (&cbuf, tgt_size, cur_node.host_end - cur_node.host_start); tgt_size += cur_node.host_end - cur_node.host_start; @@ -825,7 +833,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, len = sizes[i]; gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + tgt_size), - (void *) hostaddrs[i], len, cbufp); + (void *) hostaddrs[i], len, false, cbufp); tgt_size += len; continue; case GOMP_MAP_FIRSTPRIVATE_INT: @@ -895,12 +903,11 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, if (cur_node.tgt_offset) cur_node.tgt_offset -= sizes[i]; gomp_copy_host2dev (devicep, aq, - (void *) (n->tgt->tgt_start - + n->tgt_offset + (void *) (n->tgt->tgt_start + n->tgt_offset + cur_node.host_start - n->host_start), (void *) &cur_node.tgt_offset, - sizeof (void *), cbufp); + sizeof (void *), true, cbufp); cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset + cur_node.host_start - n->host_start; continue; @@ -972,7 +979,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, - k->host_end - k->host_start, cbufp); + k->host_end - k->host_start, false, + cbufp); break; case GOMP_MAP_POINTER: gomp_map_pointer (tgt, aq, @@ -984,7 +992,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, - k->host_end - k->host_start, cbufp); + k->host_end - k->host_start, false, + cbufp); for (j = i + 1; j < mapnum; j++) if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds, @@ -1035,7 +1044,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, (void *) (tgt->tgt_start + k->tgt_offset), (void *) k->host_start, - sizeof (void *), cbufp); + sizeof (void *), false, cbufp); break; default: gomp_mutex_unlock (&devicep->lock); @@ -1051,7 +1060,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, /* We intentionally do not use coalescing here, as it's not data allocated by the current call to this function. */ gomp_copy_host2dev (devicep, aq, (void *) n->tgt_offset, - &tgt_addr, sizeof (void *), NULL); + &tgt_addr, sizeof (void *), true, NULL); } array++; } @@ -1066,7 +1075,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, gomp_copy_host2dev (devicep, aq, (void *) (tgt->tgt_start + i * sizeof (void *)), (void *) &cur_node.tgt_offset, sizeof (void *), - cbufp); + true, cbufp); } } @@ -1078,7 +1087,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, (void *) (tgt->tgt_start + cbuf.chunks[c].start), (char *) cbuf.buf + (cbuf.chunks[c].start - cbuf.chunks[0].start), - cbuf.chunks[c].end - cbuf.chunks[c].start, NULL); + cbuf.chunks[c].end - cbuf.chunks[c].start, true, + NULL); free (cbuf.buf); cbuf.buf = NULL; cbufp = NULL; @@ -1280,7 +1290,7 @@ gomp_update (struct gomp_device_descr *devicep, size_t mapnum, void **hostaddrs, if (GOMP_MAP_COPY_TO_P (kind & typemask)) gomp_copy_host2dev (devicep, NULL, devaddr, hostaddr, size, - NULL); + false, NULL); if (GOMP_MAP_COPY_FROM_P (kind & typemask)) gomp_copy_dev2host (devicep, NULL, hostaddr, devaddr, size); } -- 2.23.0