Hi Jakub!

While reviewing Chung-Lin's
<https://gcc.gnu.org/ml/gcc-patches/2018-09/msg01428.html> "[PATCH 4/6,
OpenACC, libgomp] Async re-work, libgomp/target.c changes", I noticed the
following unrelated hunk.  Is that intentional or just an oversight that
it hasn't been included in your "gomp_coalesce_buf" changes (quoted below
for reference)?

commit 2abec5454063076ebd0fddf6ed25a3459c4f5ac3
Author: Thomas Schwinge <tho...@codesourcery.com>
Date:   Thu Dec 6 17:52:34 2018 +0100

    Coalesce host to device transfers in libgomp: link pointer
    
            libgomp/
            * target.c (gomp_map_vars): Call "gomp_copy_host2dev" instead of
            "devicep->host2dev_func".
---
 libgomp/target.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git libgomp/target.c libgomp/target.c
index 8ebc2a370a16..9cb2ec8d026f 100644
--- libgomp/target.c
+++ libgomp/target.c
@@ -957,9 +957,9 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t 
mapnum,
                    /* Set link pointer on target to the device address of the
                       mapped object.  */
                    void *tgt_addr = (void *) (tgt->tgt_start + k->tgt_offset);
-                   devicep->host2dev_func (devicep->target_id,
-                                           (void *) n->tgt_offset,
-                                           &tgt_addr, sizeof (void *));
+                   gomp_copy_host2dev (devicep, (void *) n->tgt_offset,
+                                       &tgt_addr, sizeof (void *), cbufp);
+
                  }
                array++;
              }

If approving this patch, please respond with "Reviewed-by: NAME <EMAIL>"
so that your effort will be recorded in the commit log, see
<https://gcc.gnu.org/wiki/Reviewed-by>.


Grüße
 Thomas


On Wed, 25 Oct 2017 13:38:50 +0200, Jakub Jelinek <ja...@redhat.com> wrote:
> Here is an updated patch with some renaming, extra macros, one extra inline
> function and comments.

> 2017-10-25  Jakub Jelinek  <ja...@redhat.com>
> 
>       * target.c (struct gomp_coalesce_buf): New type.
>       (MAX_COALESCE_BUF_SIZE, MAX_COALESCE_BUF_GAP): Define.
>       (gomp_coalesce_buf_add, gomp_to_device_kind_p): New functions.
>       (gomp_copy_host2dev): Add CBUF argument, if copying into
>       the cached ranges, memcpy into buffer instead of copying
>       into device.
>       (gomp_map_vars_existing, gomp_map_pointer, gomp_map_fields_existing):
>       Add CBUF argument, pass it through to other calls.
>       (gomp_map_vars): Aggregate copies from host to device if small enough
>       and with small enough gaps in between into memcpy into a buffer and
>       fewer host to device copies from the buffer.
>       (gomp_update): Adjust gomp_copy_host2dev caller.
> 
> --- libgomp/target.c.jj       2017-10-24 12:07:03.763759657 +0200
> +++ libgomp/target.c  2017-10-25 13:17:31.608975390 +0200
> @@ -177,10 +177,122 @@ gomp_device_copy (struct gomp_device_des
>      }
>  }
>  
> +/* Infrastructure for coalescing adjacent or nearly adjacent (in device 
> addresses)
> +   host to device memory transfers.  */
> +
> +struct gomp_coalesce_buf
> +{
> +  /* Buffer into which gomp_copy_host2dev will memcpy data and from which
> +     it will be copied to the device.  */
> +  void *buf;
> +  struct target_mem_desc *tgt;
> +  /* Array with offsets, chunks[2 * i] is the starting offset and
> +     chunks[2 * i + 1] ending offset relative to tgt->tgt_start device 
> address
> +     of chunks which are to be copied to buf and later copied to device.  */
> +  size_t *chunks;
> +  /* Number of chunks in chunks array, or -1 if coalesce buffering should not
> +     be performed.  */
> +  long chunk_cnt;
> +  /* During construction of chunks array, how many memory regions are within
> +     the last chunk.  If there is just one memory region for a chunk, we copy
> +     it directly to device rather than going through buf.  */
> +  long use_cnt;
> +};
> +
> +/* Maximum size of memory region considered for coalescing.  Larger copies
> +   are performed directly.  */
> +#define MAX_COALESCE_BUF_SIZE        (32 * 1024)
> +
> +/* Maximum size of a gap in between regions to consider them being copied
> +   within the same chunk.  All the device offsets considered are within
> +   newly allocated device memory, so it isn't fatal if we copy some padding
> +   in between from host to device.  The gaps come either from alignment
> +   padding or from memory regions which are not supposed to be copied from
> +   host to device (e.g. map(alloc:), map(from:) etc.).  */
> +#define MAX_COALESCE_BUF_GAP (4 * 1024)
> +
> +/* Add region with device tgt_start relative offset and length to CBUF.  */
> +
> +static inline void
> +gomp_coalesce_buf_add (struct gomp_coalesce_buf *cbuf, size_t start, size_t 
> len)
> +{
> +  if (len > MAX_COALESCE_BUF_SIZE || len == 0)
> +    return;
> +  if (cbuf->chunk_cnt)
> +    {
> +      if (cbuf->chunk_cnt < 0)
> +     return;
> +      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
> +     {
> +       cbuf->chunk_cnt = -1;
> +       return;
> +     }
> +      if (start < cbuf->chunks[2 * cbuf->chunk_cnt - 1] + 
> MAX_COALESCE_BUF_GAP)
> +     {
> +       cbuf->chunks[2 * cbuf->chunk_cnt - 1] = start + len;
> +       cbuf->use_cnt++;
> +       return;
> +     }
> +      /* If the last chunk is only used by one mapping, discard it,
> +      as it will be one host to device copy anyway and
> +      memcpying it around will only waste cycles.  */
> +      if (cbuf->use_cnt == 1)
> +     cbuf->chunk_cnt--;
> +    }
> +  cbuf->chunks[2 * cbuf->chunk_cnt] = start;
> +  cbuf->chunks[2 * cbuf->chunk_cnt + 1] = start + len;
> +  cbuf->chunk_cnt++;
> +  cbuf->use_cnt = 1;
> +}
> +
> +/* Return true for mapping kinds which need to copy data from the
> +   host to device for regions that weren't previously mapped.  */
> +
> +static inline bool
> +gomp_to_device_kind_p (int kind)
> +{
> +  switch (kind)
> +    {
> +    case GOMP_MAP_ALLOC:
> +    case GOMP_MAP_FROM:
> +    case GOMP_MAP_FORCE_ALLOC:
> +    case GOMP_MAP_ALWAYS_FROM:
> +      return false;
> +    default:
> +      return true;
> +    }
> +}
> +
>  static void
>  gomp_copy_host2dev (struct gomp_device_descr *devicep,
> -                 void *d, const void *h, size_t sz)
> +                 void *d, const void *h, size_t sz,
> +                 struct gomp_coalesce_buf *cbuf)
>  {
> +  if (cbuf)
> +    {
> +      uintptr_t doff = (uintptr_t) d - cbuf->tgt->tgt_start;
> +      if (doff < cbuf->chunks[2 * cbuf->chunk_cnt - 1])
> +     {
> +       long first = 0;
> +       long last = cbuf->chunk_cnt - 1;
> +       while (first <= last)
> +         {
> +           long middle = (first + last) >> 1;
> +           if (cbuf->chunks[2 * middle + 1] <= doff)
> +             first = middle + 1;
> +           else if (cbuf->chunks[2 * middle] <= doff)
> +             {
> +               if (doff + sz > cbuf->chunks[2 * middle + 1])
> +                 gomp_fatal ("internal libgomp cbuf error");
> +               memcpy ((char *) cbuf->buf + (doff - cbuf->chunks[0]),
> +                       h, sz);
> +               return;
> +             }
> +           else
> +             last = middle - 1;
> +         }
> +     }
> +    }
>    gomp_device_copy (devicep, devicep->host2dev_func, "dev", d, "host", h, 
> sz);
>  }
>  
> @@ -208,7 +320,7 @@ gomp_free_device_memory (struct gomp_dev
>  static inline void
>  gomp_map_vars_existing (struct gomp_device_descr *devicep, splay_tree_key 
> oldn,
>                       splay_tree_key newn, struct target_var_desc *tgt_var,
> -                     unsigned char kind)
> +                     unsigned char kind, struct gomp_coalesce_buf *cbuf)
>  {
>    tgt_var->key = oldn;
>    tgt_var->copy_from = GOMP_MAP_COPY_FROM_P (kind);
> @@ -232,7 +344,7 @@ gomp_map_vars_existing (struct gomp_devi
>                       (void *) (oldn->tgt->tgt_start + oldn->tgt_offset
>                                 + newn->host_start - oldn->host_start),
>                       (void *) newn->host_start,
> -                     newn->host_end - newn->host_start);
> +                     newn->host_end - newn->host_start, cbuf);
>  
>    if (oldn->refcount != REFCOUNT_INFINITY)
>      oldn->refcount++;
> @@ -247,7 +359,8 @@ get_kind (bool short_mapkind, void *kind
>  
>  static void
>  gomp_map_pointer (struct target_mem_desc *tgt, uintptr_t host_ptr,
> -               uintptr_t target_offset, uintptr_t bias)
> +               uintptr_t target_offset, uintptr_t bias,
> +               struct gomp_coalesce_buf *cbuf)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
>    struct splay_tree_s *mem_map = &devicep->mem_map;
> @@ -257,11 +370,10 @@ gomp_map_pointer (struct target_mem_desc
>    if (cur_node.host_start == (uintptr_t) NULL)
>      {
>        cur_node.tgt_offset = (uintptr_t) NULL;
> -      /* FIXME: see comment about coalescing host/dev transfers below.  */
>        gomp_copy_host2dev (devicep,
>                         (void *) (tgt->tgt_start + target_offset),
>                         (void *) &cur_node.tgt_offset,
> -                       sizeof (void *));
> +                       sizeof (void *), cbuf);
>        return;
>      }
>    /* Add bias to the pointer value.  */
> @@ -280,15 +392,15 @@ gomp_map_pointer (struct target_mem_desc
>       array section.  Now subtract bias to get what we want
>       to initialize the pointer with.  */
>    cur_node.tgt_offset -= bias;
> -  /* FIXME: see comment about coalescing host/dev transfers below.  */
>    gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + target_offset),
> -                   (void *) &cur_node.tgt_offset, sizeof (void *));
> +                   (void *) &cur_node.tgt_offset, sizeof (void *), cbuf);
>  }
>  
>  static void
>  gomp_map_fields_existing (struct target_mem_desc *tgt, splay_tree_key n,
>                         size_t first, size_t i, void **hostaddrs,
> -                       size_t *sizes, void *kinds)
> +                       size_t *sizes, void *kinds,
> +                       struct gomp_coalesce_buf *cbuf)
>  {
>    struct gomp_device_descr *devicep = tgt->device_descr;
>    struct splay_tree_s *mem_map = &devicep->mem_map;
> @@ -306,7 +418,7 @@ gomp_map_fields_existing (struct target_
>        && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>      {
>        gomp_map_vars_existing (devicep, n2, &cur_node,
> -                           &tgt->list[i], kind & typemask);
> +                           &tgt->list[i], kind & typemask, cbuf);
>        return;
>      }
>    if (sizes[i] == 0)
> @@ -322,7 +434,7 @@ gomp_map_fields_existing (struct target_
>                == n2->tgt_offset - n->tgt_offset)
>           {
>             gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> -                                   kind & typemask);
> +                                   kind & typemask, cbuf);
>             return;
>           }
>       }
> @@ -334,7 +446,7 @@ gomp_map_fields_existing (struct target_
>         && n2->host_start - n->host_start == n2->tgt_offset - n->tgt_offset)
>       {
>         gomp_map_vars_existing (devicep, n2, &cur_node, &tgt->list[i],
> -                               kind & typemask);
> +                               kind & typemask, cbuf);
>         return;
>       }
>      }
> @@ -381,6 +493,7 @@ gomp_map_vars (struct gomp_device_descr
>    tgt->list_count = mapnum;
>    tgt->refcount = pragma_kind == GOMP_MAP_VARS_ENTER_DATA ? 0 : 1;
>    tgt->device_descr = devicep;
> +  struct gomp_coalesce_buf cbuf, *cbufp = NULL;
>  
>    if (mapnum == 0)
>      {
> @@ -391,11 +504,25 @@ gomp_map_vars (struct gomp_device_descr
>  
>    tgt_align = sizeof (void *);
>    tgt_size = 0;
> +  cbuf.chunks = NULL;
> +  cbuf.chunk_cnt = -1;
> +  cbuf.use_cnt = 0;
> +  cbuf.buf = NULL;
> +  if (mapnum > 1 || pragma_kind == GOMP_MAP_VARS_TARGET)
> +    {
> +      cbuf.chunks
> +     = (size_t *) gomp_alloca ((2 * mapnum + 2) * sizeof (size_t));
> +      cbuf.chunk_cnt = 0;
> +    }
>    if (pragma_kind == GOMP_MAP_VARS_TARGET)
>      {
>        size_t align = 4 * sizeof (void *);
>        tgt_align = align;
>        tgt_size = mapnum * sizeof (void *);
> +      cbuf.chunk_cnt = 1;
> +      cbuf.use_cnt = 1 + (mapnum > 1);
> +      cbuf.chunks[0] = 0;
> +      cbuf.chunks[1] = tgt_size;
>      }
>  
>    gomp_mutex_lock (&devicep->lock);
> @@ -449,19 +576,26 @@ gomp_map_vars (struct gomp_device_descr
>             size_t align = (size_t) 1 << (kind >> rshift);
>             if (tgt_align < align)
>               tgt_align = align;
> -           tgt_size -= (uintptr_t) hostaddrs[first]
> -                       - (uintptr_t) hostaddrs[i];
> +           tgt_size -= (uintptr_t) hostaddrs[first] - cur_node.host_start;
>             tgt_size = (tgt_size + align - 1) & ~(align - 1);
> -           tgt_size += cur_node.host_end - (uintptr_t) hostaddrs[i];
> +           tgt_size += cur_node.host_end - cur_node.host_start;
>             not_found_cnt += last - i;
>             for (i = first; i <= last; i++)
> -             tgt->list[i].key = NULL;
> +             {
> +               tgt->list[i].key = NULL;
> +               if (gomp_to_device_kind_p (get_kind (short_mapkind, kinds, i)
> +                                          & typemask))
> +                 gomp_coalesce_buf_add (&cbuf,
> +                                        tgt_size - cur_node.host_end
> +                                        + (uintptr_t) hostaddrs[i],
> +                                        sizes[i]);
> +             }
>             i--;
>             continue;
>           }
>         for (i = first; i <= last; i++)
>           gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> -                                   sizes, kinds);
> +                                   sizes, kinds, NULL);
>         i--;
>         continue;
>       }
> @@ -485,6 +619,8 @@ gomp_map_vars (struct gomp_device_descr
>         if (tgt_align < align)
>           tgt_align = align;
>         tgt_size = (tgt_size + align - 1) & ~(align - 1);
> +       gomp_coalesce_buf_add (&cbuf, tgt_size,
> +                              cur_node.host_end - cur_node.host_start);
>         tgt_size += cur_node.host_end - cur_node.host_start;
>         has_firstprivate = true;
>         continue;
> @@ -504,7 +640,7 @@ gomp_map_vars (struct gomp_device_descr
>       n = splay_tree_lookup (mem_map, &cur_node);
>        if (n && n->refcount != REFCOUNT_LINK)
>       gomp_map_vars_existing (devicep, n, &cur_node, &tgt->list[i],
> -                             kind & typemask);
> +                             kind & typemask, NULL);
>        else
>       {
>         tgt->list[i].key = NULL;
> @@ -514,6 +650,9 @@ gomp_map_vars (struct gomp_device_descr
>         if (tgt_align < align)
>           tgt_align = align;
>         tgt_size = (tgt_size + align - 1) & ~(align - 1);
> +       if (gomp_to_device_kind_p (kind & typemask))
> +         gomp_coalesce_buf_add (&cbuf, tgt_size,
> +                                cur_node.host_end - cur_node.host_start);
>         tgt_size += cur_node.host_end - cur_node.host_start;
>         if ((kind & typemask) == GOMP_MAP_TO_PSET)
>           {
> @@ -562,6 +701,19 @@ gomp_map_vars (struct gomp_device_descr
>        tgt->tgt_start = (uintptr_t) tgt->to_free;
>        tgt->tgt_start = (tgt->tgt_start + tgt_align - 1) & ~(tgt_align - 1);
>        tgt->tgt_end = tgt->tgt_start + tgt_size;
> +
> +      if (cbuf.use_cnt == 1)
> +     cbuf.chunk_cnt--;
> +      if (cbuf.chunk_cnt > 0)
> +     {
> +       cbuf.buf
> +         = malloc (cbuf.chunks[2 * cbuf.chunk_cnt - 1] - cbuf.chunks[0]);
> +       if (cbuf.buf)
> +         {
> +           cbuf.tgt = tgt;
> +           cbufp = &cbuf;
> +         }
> +     }
>      }
>    else
>      {
> @@ -600,7 +752,7 @@ gomp_map_vars (struct gomp_device_descr
>               len = sizes[i];
>               gomp_copy_host2dev (devicep,
>                                   (void *) (tgt->tgt_start + tgt_size),
> -                                 (void *) hostaddrs[i], len);
> +                                 (void *) hostaddrs[i], len, cbufp);
>               tgt_size += len;
>               continue;
>             case GOMP_MAP_FIRSTPRIVATE_INT:
> @@ -633,7 +785,7 @@ gomp_map_vars (struct gomp_device_descr
>                 }
>               for (i = first; i <= last; i++)
>                 gomp_map_fields_existing (tgt, n, first, i, hostaddrs,
> -                                         sizes, kinds);
> +                                         sizes, kinds, cbufp);
>               i--;
>               continue;
>             case GOMP_MAP_ALWAYS_POINTER:
> @@ -658,7 +810,7 @@ gomp_map_vars (struct gomp_device_descr
>                                             + cur_node.host_start
>                                             - n->host_start),
>                                   (void *) &cur_node.tgt_offset,
> -                                 sizeof (void *));
> +                                 sizeof (void *), cbufp);
>               cur_node.tgt_offset = n->tgt->tgt_start + n->tgt_offset
>                                     + cur_node.host_start - n->host_start;
>               continue;
> @@ -674,7 +826,7 @@ gomp_map_vars (struct gomp_device_descr
>           splay_tree_key n = splay_tree_lookup (mem_map, k);
>           if (n && n->refcount != REFCOUNT_LINK)
>             gomp_map_vars_existing (devicep, n, k, &tgt->list[i],
> -                                   kind & typemask);
> +                                   kind & typemask, cbufp);
>           else
>             {
>               k->link_key = NULL;
> @@ -725,26 +877,22 @@ gomp_map_vars (struct gomp_device_descr
>                 case GOMP_MAP_FORCE_TOFROM:
>                 case GOMP_MAP_ALWAYS_TO:
>                 case GOMP_MAP_ALWAYS_TOFROM:
> -                 /* FIXME: Perhaps add some smarts, like if copying
> -                    several adjacent fields from host to target, use some
> -                    host buffer to avoid sending each var individually.  */
>                   gomp_copy_host2dev (devicep,
>                                       (void *) (tgt->tgt_start
>                                                 + k->tgt_offset),
>                                       (void *) k->host_start,
> -                                     k->host_end - k->host_start);
> +                                     k->host_end - k->host_start, cbufp);
>                   break;
>                 case GOMP_MAP_POINTER:
>                   gomp_map_pointer (tgt, (uintptr_t) *(void **) k->host_start,
> -                                   k->tgt_offset, sizes[i]);
> +                                   k->tgt_offset, sizes[i], cbufp);
>                   break;
>                 case GOMP_MAP_TO_PSET:
> -                 /* FIXME: see above FIXME comment.  */
>                   gomp_copy_host2dev (devicep,
>                                       (void *) (tgt->tgt_start
>                                                 + k->tgt_offset),
>                                       (void *) k->host_start,
> -                                     k->host_end - k->host_start);
> +                                     k->host_end - k->host_start, cbufp);
>  
>                   for (j = i + 1; j < mapnum; j++)
>                     if (!GOMP_MAP_POINTER_P (get_kind (short_mapkind, kinds,
> @@ -767,7 +915,7 @@ gomp_map_vars (struct gomp_device_descr
>                                           k->tgt_offset
>                                           + ((uintptr_t) hostaddrs[j]
>                                              - k->host_start),
> -                                         sizes[j]);
> +                                         sizes[j], cbufp);
>                         i++;
>                       }
>                   break;
> @@ -795,7 +943,7 @@ gomp_map_vars (struct gomp_device_descr
>                                       (void *) (tgt->tgt_start
>                                                 + k->tgt_offset),
>                                       (void *) k->host_start,
> -                                     sizeof (void *));
> +                                     sizeof (void *), cbufp);
>                   break;
>                 default:
>                   gomp_mutex_unlock (&devicep->lock);
> @@ -822,13 +970,23 @@ gomp_map_vars (struct gomp_device_descr
>        for (i = 0; i < mapnum; i++)
>       {
>         cur_node.tgt_offset = gomp_map_val (tgt, hostaddrs, i);
> -       /* FIXME: see above FIXME comment.  */
>         gomp_copy_host2dev (devicep,
>                             (void *) (tgt->tgt_start + i * sizeof (void *)),
> -                           (void *) &cur_node.tgt_offset, sizeof (void *));
> +                           (void *) &cur_node.tgt_offset, sizeof (void *),
> +                           cbufp);
>       }
>      }
>  
> +  if (cbufp)
> +    {
> +      long c = 0;
> +      for (c = 0; c < cbuf.chunk_cnt; ++c)
> +     gomp_copy_host2dev (devicep, (void *) (tgt->tgt_start + cbuf.chunks[2 * 
> c]),
> +                         (char *) cbuf.buf + (cbuf.chunks[2 * c] - 
> cbuf.chunks[0]),
> +                         cbuf.chunks[2 * c + 1] - cbuf.chunks[2 * c], NULL);
> +      free (cbuf.buf);
> +    }
> +
>    /* If the variable from "omp target enter data" map-list was already 
> mapped,
>       tgt is not needed.  Otherwise tgt will be freed by gomp_unmap_vars or
>       gomp_exit_data.  */
> @@ -970,7 +1128,7 @@ gomp_update (struct gomp_device_descr *d
>           size_t size = cur_node.host_end - cur_node.host_start;
>  
>           if (GOMP_MAP_COPY_TO_P (kind & typemask))
> -           gomp_copy_host2dev (devicep, devaddr, hostaddr, size);
> +           gomp_copy_host2dev (devicep, devaddr, hostaddr, size, NULL);
>           if (GOMP_MAP_COPY_FROM_P (kind & typemask))
>             gomp_copy_dev2host (devicep, hostaddr, devaddr, size);
>         }

Reply via email to