From: Andrew Stubbs <a...@codesourcery.com> This adds support for using Cuda Managed Memory with omp_alloc. It will be used as the underpinnings for "requires unified_shared_memory" in a later patch.
There are two new predefined allocators, ompx_gnu_unified_shared_mem_alloc and ompx_gnu_host_mem_alloc, plus corresponding memory spaces, which can be used to allocate memory in the "managed" space and explicitly on the host (it is intended that "malloc" will be intercepted by the compiler). The nvptx plugin is modified to make the necessary Cuda calls, and libgomp is modified to switch to shared-memory mode for USM allocated mappings. gcc/fortran/ChangeLog: * openmp.cc (is_predefined_allocator): Recognise new allocators. include/ChangeLog: * cuda/cuda.h (CUdevice_attribute): Add definitions for CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR. (CUmemAttach_flags): New. (CUpointer_attribute): New. (cuMemAllocManaged): New prototype. (cuPointerGetAttribute): New prototype. libgomp/ChangeLog: * allocator.c (ompx_gnu_max_predefined_alloc): Update. (predefined_ompx_gnu_alloc_mapping): Add ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space. (omp_init_allocator): Recognise ompx_gnu_pinned_mem_alloc and ompx_gnu_host_mem_space. * config/linux/allocator.c (linux_memspace_alloc): Support USM. (linux_memspace_calloc): Likewise. (linux_memspace_free): Likewise. (linux_memspace_realloc): Likewise. * config/nvptx/allocator.c (nvptx_memspace_alloc): Disallow host memory. (nvptx_memspace_calloc): Likewise. (nvptx_memspace_free): Likewise. (nvptx_memspace_realloc): Likewise. * libgomp-plugin.h (GOMP_OFFLOAD_usm_alloc): New prototype. (GOMP_OFFLOAD_usm_free): New prototype. (GOMP_OFFLOAD_is_usm_ptr): New prototype. * libgomp.h (gomp_usm_alloc): New prototype. (gomp_usm_free): New prototype. (OFFSET_USM): New define. (struct gomp_device_descr): Add USM functions. * omp.h.in (omp_memspace_handle_t): Add ompx_gnu_unified_shared_mem_space and ompx_gnu_host_mem_space. (omp_allocator_handle_t): Ad ompx_gnu_unified_shared_mem_alloc and ompx_gnu_host_mem_alloc. * omp_lib.f90.in: Likewise. * omp_lib.h.in: Likewise. * plugin/cuda-lib.def (cuMemAllocManaged): Add new call. (cuPointerGetAttribute): Likewise. * plugin/plugin-nvptx.c (nvptx_alloc): Add "usm" parameter. Call cuMemAllocManaged as appropriate. (GOMP_OFFLOAD_get_num_devices): Allow GOMP_REQUIRES_UNIFIED_SHARED_MEMORY if the device supports managed memory or integrated memory. (GOMP_OFFLOAD_alloc): Move internals to ... (GOMP_OFFLOAD_alloc_1): ... this, and add usm parameter. (GOMP_OFFLOAD_usm_alloc): New function. (GOMP_OFFLOAD_usm_free): New function. (GOMP_OFFLOAD_is_usm_ptr): New function. * target.c (gomp_map_pointer): Add USM support. (gomp_attach_pointer): Likewise. (gomp_map_val): Likewise. (gomp_map_vars_internal): Likewise. (gomp_usm_alloc): New function. (gomp_usm_free): New function. (gomp_load_plugin_for_device): Add usm_alloc, usm_free, and is_usm_ptr. * testsuite/lib/libgomp.exp (check_effective_target_omp_usm): New. * testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c: New test. * testsuite/libgomp.c/usm-1.c: New test. * testsuite/libgomp.c/usm-2.c: New test. * testsuite/libgomp.c/usm-3.c: New test. * testsuite/libgomp.c/usm-4.c: New test. * testsuite/libgomp.c/usm-5.c: New test. * testsuite/libgomp.fortran/usm-3.f90: New test. */testsuite/libgomp.c-c++-common/requires-5.c: Fix static data failure. co-authored-by: Kwok Cheung Yeung <kcye...@baylibre.com> co-authored-by: Thomas Schwinge <tschwi...@baylibre.com> --- gcc/fortran/openmp.cc | 8 +- include/cuda/cuda.h | 13 ++++ libgomp/allocator.c | 17 ++-- libgomp/config/linux/allocator.c | 21 ++++- libgomp/config/nvptx/allocator.c | 10 +++ libgomp/libgomp-plugin.h | 3 + libgomp/libgomp.h | 6 ++ libgomp/omp.h.in | 4 + libgomp/omp_lib.f90.in | 8 ++ libgomp/omp_lib.h.in | 10 +++ libgomp/plugin/cuda-lib.def | 2 + libgomp/plugin/plugin-nvptx.c | 52 +++++++++++-- libgomp/target.c | 77 ++++++++++++++++++- libgomp/testsuite/lib/libgomp.exp | 10 +++ .../libgomp.c-c++-common/requires-5.c | 3 +- .../alloc-ompx_gnu_host_mem_alloc-1.c | 77 +++++++++++++++++++ libgomp/testsuite/libgomp.c/usm-1.c | 25 ++++++ libgomp/testsuite/libgomp.c/usm-2.c | 33 ++++++++ libgomp/testsuite/libgomp.c/usm-3.c | 36 +++++++++ libgomp/testsuite/libgomp.c/usm-4.c | 37 +++++++++ libgomp/testsuite/libgomp.c/usm-5.c | 28 +++++++ libgomp/testsuite/libgomp.fortran/usm-3.f90 | 33 ++++++++ 22 files changed, 491 insertions(+), 22 deletions(-) create mode 100644 libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c create mode 100644 libgomp/testsuite/libgomp.c/usm-1.c create mode 100644 libgomp/testsuite/libgomp.c/usm-2.c create mode 100644 libgomp/testsuite/libgomp.c/usm-3.c create mode 100644 libgomp/testsuite/libgomp.c/usm-4.c create mode 100644 libgomp/testsuite/libgomp.c/usm-5.c create mode 100644 libgomp/testsuite/libgomp.fortran/usm-3.f90 diff --git a/gcc/fortran/openmp.cc b/gcc/fortran/openmp.cc index a177afb4974..548b36a4b62 100644 --- a/gcc/fortran/openmp.cc +++ b/gcc/fortran/openmp.cc @@ -7352,9 +7352,9 @@ resolve_omp_udr_clause (gfc_omp_namelist *n, gfc_namespace *ns, } /* Assume that a constant expression in the range 1 (omp_default_mem_alloc) - to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) is - fine. The original symbol name is already lost during matching via - gfc_match_expr. */ + to 8 (omp_thread_mem_alloc) range, or 200 (ompx_gnu_pinned_mem_alloc) to + 202 (ompx_gnu_host_mem_alloc) is fine. The original symbol name is already + lost during matching via gfc_match_expr. */ static bool is_predefined_allocator (gfc_expr *expr) { @@ -7366,7 +7366,7 @@ is_predefined_allocator (gfc_expr *expr) && ((mpz_sgn (expr->value.integer) > 0 && mpz_cmp_si (expr->value.integer, 8) <= 0) || (mpz_cmp_si (expr->value.integer, 200) >= 0 - && mpz_cmp_si (expr->value.integer, 200) <= 0))); + && mpz_cmp_si (expr->value.integer, 202) <= 0))); } /* Resolve declarative ALLOCATE statement. Note: Common block vars only appear diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 804d08ca57e..d0cfd0e471e 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -83,10 +83,21 @@ typedef enum { CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88 } CUdevice_attribute; +typedef enum { + CU_MEM_ATTACH_GLOBAL = 0x1 +} CUmemAttach_flags; + +typedef enum { + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8 +} CUpointer_attribute; + enum { CU_EVENT_DEFAULT = 0, CU_EVENT_DISABLE_TIMING = 2 @@ -247,6 +258,7 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t); #define cuMemAllocHost cuMemAllocHost_v2 CUresult cuMemAllocHost (void **, size_t); CUresult cuMemHostAlloc (void **, size_t, unsigned int); +CUresult cuMemAllocManaged(CUdeviceptr *, size_t, unsigned int); CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t); CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); @@ -287,6 +299,7 @@ CUresult cuModuleLoadData (CUmodule *, const void *); CUresult cuModuleUnload (CUmodule); CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int); +CUresult cuPointerGetAttribute(void *, CUpointer_attribute, CUdeviceptr); typedef void (*CUstreamCallback)(CUstream, CUresult, void *); CUresult cuStreamAddCallback(CUstream, CUstreamCallback, void *, unsigned int); CUresult cuStreamCreate (CUstream *, unsigned); diff --git a/libgomp/allocator.c b/libgomp/allocator.c index 91aa58e162b..1dee642684e 100644 --- a/libgomp/allocator.c +++ b/libgomp/allocator.c @@ -100,7 +100,7 @@ GOMP_is_alloc (void *ptr) #define omp_max_predefined_alloc omp_thread_mem_alloc #define ompx_gnu_min_predefined_alloc ompx_gnu_pinned_mem_alloc -#define ompx_gnu_max_predefined_alloc ompx_gnu_pinned_mem_alloc +#define ompx_gnu_max_predefined_alloc ompx_gnu_host_mem_alloc /* These macros may be overridden in config/<target>/allocator.c. The defaults (no override) are to return NULL for pinned memory requests @@ -146,6 +146,8 @@ static const omp_memspace_handle_t predefined_omp_alloc_mapping[] = { }; static const omp_memspace_handle_t predefined_ompx_gnu_alloc_mapping[] = { omp_default_mem_space, /* ompx_gnu_pinned_mem_alloc. */ + ompx_gnu_unified_shared_mem_space, /* ompx_gnu_unified_shared_mem_alloc. */ + ompx_gnu_host_mem_space, /* ompx_gnu_host_mem_alloc. */ }; #define ARRAY_SIZE(A) (sizeof (A) / sizeof ((A)[0])) @@ -380,7 +382,9 @@ omp_init_allocator (omp_memspace_handle_t memspace, int ntraits, struct omp_allocator_data *ret; int i; - if (memspace > omp_low_lat_mem_space) + if (memspace > omp_low_lat_mem_space + && (memspace < ompx_gnu_unified_shared_mem_space + || memspace > ompx_gnu_host_mem_space)) return omp_null_allocator; for (i = 0; i < ntraits; i++) switch (traits[i].key) @@ -743,7 +747,8 @@ fail:; int fallback = (allocator_data ? allocator_data->fallback : (allocator == omp_default_mem_alloc - || allocator == ompx_gnu_pinned_mem_alloc) + || allocator == ompx_gnu_pinned_mem_alloc + || allocator == ompx_gnu_host_mem_alloc) ? omp_atv_null_fb : omp_atv_default_mem_fb); switch (fallback) @@ -1057,7 +1062,8 @@ fail:; int fallback = (allocator_data ? allocator_data->fallback : (allocator == omp_default_mem_alloc - || allocator == ompx_gnu_pinned_mem_alloc) + || allocator == ompx_gnu_pinned_mem_alloc + || allocator == ompx_gnu_host_mem_alloc) ? omp_atv_null_fb : omp_atv_default_mem_fb); switch (fallback) @@ -1440,7 +1446,8 @@ fail:; int fallback = (allocator_data ? allocator_data->fallback : (allocator == omp_default_mem_alloc - || allocator == ompx_gnu_pinned_mem_alloc) + || allocator == ompx_gnu_pinned_mem_alloc + || allocator == ompx_gnu_host_mem_alloc) ? omp_atv_null_fb : omp_atv_default_mem_fb); switch (fallback) diff --git a/libgomp/config/linux/allocator.c b/libgomp/config/linux/allocator.c index 04214973cc9..81d2877b8f1 100644 --- a/libgomp/config/linux/allocator.c +++ b/libgomp/config/linux/allocator.c @@ -101,7 +101,9 @@ linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin, /* Explicit pinning may not be required. */ pin = pin && !always_pinned_mode; - if (pin) + if (memspace == ompx_gnu_unified_shared_mem_space) + addr = gomp_usm_alloc (size); + else if (pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, @@ -192,7 +194,13 @@ linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin) /* Explicit pinning may not be required. */ pin = pin && !always_pinned_mode; - if (pin) + if (memspace == ompx_gnu_unified_shared_mem_space) + { + void *ret = gomp_usm_alloc (size); + memset (ret, 0, size); + return ret; + } + else if (pin) return linux_memspace_alloc (memspace, size, pin, true); else return calloc (1, size); @@ -208,7 +216,9 @@ linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size, /* Explicit pinning may not be required. */ pin = pin && !always_pinned_mode; - if (pin) + if (memspace == ompx_gnu_unified_shared_mem_space) + gomp_usm_free (addr); + else if (pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, @@ -234,7 +244,10 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr, /* Explicit pinning may not be required. */ pin = pin && !always_pinned_mode; - if (oldpin && pin) + if (memspace == ompx_gnu_unified_shared_mem_space) + /* Realloc is not implemented for USM. */ + ; + else if (oldpin && pin) { int using_device = __atomic_load_n (&using_device_for_page_locked, diff --git a/libgomp/config/nvptx/allocator.c b/libgomp/config/nvptx/allocator.c index acf8646e574..30b4e87ca67 100644 --- a/libgomp/config/nvptx/allocator.c +++ b/libgomp/config/nvptx/allocator.c @@ -42,6 +42,7 @@ chunks. */ #include "libgomp.h" +#include <assert.h> #include <stdlib.h> #define BASIC_ALLOC_PREFIX __nvptx_lowlat @@ -61,6 +62,8 @@ nvptx_memspace_alloc (omp_memspace_handle_t memspace, size_t size) return __nvptx_lowlat_alloc (shared_pool, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return malloc (size); } @@ -75,6 +78,8 @@ nvptx_memspace_calloc (omp_memspace_handle_t memspace, size_t size) return __nvptx_lowlat_calloc (shared_pool, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return calloc (1, size); } @@ -89,6 +94,9 @@ nvptx_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size) __nvptx_lowlat_free (shared_pool, addr, size); } + else if (memspace == ompx_gnu_host_mem_space) + /* Just verify what all allocator functions return. */ + assert (addr == NULL); else free (addr); } @@ -104,6 +112,8 @@ nvptx_memspace_realloc (omp_memspace_handle_t memspace, void *addr, return __nvptx_lowlat_realloc (shared_pool, addr, oldsize, size); } + else if (memspace == ompx_gnu_host_mem_space) + return NULL; else return realloc (addr, size); } diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 100dbca1633..404209802f8 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -140,6 +140,9 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *, extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *); extern void *GOMP_OFFLOAD_alloc (int, size_t); extern bool GOMP_OFFLOAD_free (int, void *); +extern void *GOMP_OFFLOAD_usm_alloc (int, size_t); +extern bool GOMP_OFFLOAD_usm_free (int, void *); +extern bool GOMP_OFFLOAD_is_usm_ptr (void *); extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t); extern bool GOMP_OFFLOAD_page_locked_host_free (void *); extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t); diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index f48bf7418f0..707fcdb39d7 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1128,6 +1128,8 @@ extern int gomp_get_num_devices (void); extern bool gomp_target_task_fn (void *); extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, int, struct goacc_asyncqueue *); +extern void *gomp_usm_alloc (size_t size); +extern void gomp_usm_free (void *device_ptr); extern bool gomp_page_locked_host_alloc (void **, size_t); extern void gomp_page_locked_host_free (void *); @@ -1192,6 +1194,7 @@ struct target_mem_desc; #define OFFSET_INLINED (~(uintptr_t) 0) #define OFFSET_POINTER (~(uintptr_t) 1) #define OFFSET_STRUCT (~(uintptr_t) 2) +#define OFFSET_USM (~(uintptr_t) 3) /* Auxiliary structure for infrequently-used or API-specific data. */ @@ -1412,6 +1415,9 @@ struct gomp_device_descr __typeof (GOMP_OFFLOAD_unload_image) *unload_image_func; __typeof (GOMP_OFFLOAD_alloc) *alloc_func; __typeof (GOMP_OFFLOAD_free) *free_func; + __typeof (GOMP_OFFLOAD_usm_alloc) *usm_alloc_func; + __typeof (GOMP_OFFLOAD_usm_free) *usm_free_func; + __typeof (GOMP_OFFLOAD_is_usm_ptr) *is_usm_ptr_func; __typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func; __typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func; __typeof (GOMP_OFFLOAD_dev2host) *dev2host_func; diff --git a/libgomp/omp.h.in b/libgomp/omp.h.in index 4438d341160..db400830113 100644 --- a/libgomp/omp.h.in +++ b/libgomp/omp.h.in @@ -120,6 +120,8 @@ typedef enum omp_memspace_handle_t __GOMP_UINTPTR_T_ENUM omp_const_mem_space = 2, omp_high_bw_mem_space = 3, omp_low_lat_mem_space = 4, + ompx_gnu_unified_shared_mem_space = 201, + ompx_gnu_host_mem_space = 202, __omp_memspace_handle_t_max__ = __UINTPTR_MAX__ } omp_memspace_handle_t; @@ -135,6 +137,8 @@ typedef enum omp_allocator_handle_t __GOMP_UINTPTR_T_ENUM omp_pteam_mem_alloc = 7, omp_thread_mem_alloc = 8, ompx_gnu_pinned_mem_alloc = 200, + ompx_gnu_unified_shared_mem_alloc = 201, + ompx_gnu_host_mem_alloc = 202, __omp_allocator_handle_t_max__ = __UINTPTR_MAX__ } omp_allocator_handle_t; diff --git a/libgomp/omp_lib.f90.in b/libgomp/omp_lib.f90.in index 545a79fcec9..f8d7d6a3e99 100644 --- a/libgomp/omp_lib.f90.in +++ b/libgomp/omp_lib.f90.in @@ -160,6 +160,10 @@ parameter :: omp_thread_mem_alloc = 8 integer (kind=omp_allocator_handle_kind), & parameter :: ompx_gnu_pinned_mem_alloc = 200 + integer (kind=omp_allocator_handle_kind), & + parameter :: ompx_gnu_unified_shared_mem_alloc = 201 + integer (kind=omp_allocator_handle_kind), & + parameter :: ompx_gnu_host_mem_alloc = 202 integer (omp_memspace_handle_kind), & parameter :: omp_default_mem_space = 0 integer (omp_memspace_handle_kind), & @@ -170,6 +174,10 @@ parameter :: omp_high_bw_mem_space = 3 integer (omp_memspace_handle_kind), & parameter :: omp_low_lat_mem_space = 4 + integer (omp_memspace_handle_kind), & + parameter :: ompx_gnu_unified_shared_mem_space = 201 + integer (omp_memspace_handle_kind), & + parameter :: ompx_gnu_host_mem_space = 202 integer, parameter :: omp_initial_device = -1 integer, parameter :: omp_invalid_device = -4 diff --git a/libgomp/omp_lib.h.in b/libgomp/omp_lib.h.in index b5144bb4144..b56d62d5994 100644 --- a/libgomp/omp_lib.h.in +++ b/libgomp/omp_lib.h.in @@ -156,6 +156,9 @@ integer (omp_allocator_handle_kind) omp_pteam_mem_alloc integer (omp_allocator_handle_kind) omp_thread_mem_alloc integer (omp_allocator_handle_kind) ompx_gnu_pinned_mem_alloc + integer (omp_allocator_handle_kind) & + & ompx_gnu_unified_shared_mem_alloc + integer (omp_allocator_handle_kind) ompx_gnu_host_mem_alloc parameter (omp_null_allocator = 0) parameter (omp_default_mem_alloc = 1) parameter (omp_large_cap_mem_alloc = 2) @@ -166,16 +169,23 @@ parameter (omp_pteam_mem_alloc = 7) parameter (omp_thread_mem_alloc = 8) parameter (ompx_gnu_pinned_mem_alloc = 200) + parameter (ompx_gnu_unified_shared_mem_alloc = 201) + parameter (ompx_gnu_host_mem_alloc = 202) integer (omp_memspace_handle_kind) omp_default_mem_space integer (omp_memspace_handle_kind) omp_large_cap_mem_space integer (omp_memspace_handle_kind) omp_const_mem_space integer (omp_memspace_handle_kind) omp_high_bw_mem_space integer (omp_memspace_handle_kind) omp_low_lat_mem_space + integer (omp_memspace_handle_kind) & + & ompx_gnu_unified_shared_mem_space + integer (omp_memspace_handle_kind) ompx_gnu_host_mem_space parameter (omp_default_mem_space = 0) parameter (omp_large_cap_mem_space = 1) parameter (omp_const_mem_space = 2) parameter (omp_high_bw_mem_space = 3) parameter (omp_low_lat_mem_space = 4) + parameter (ompx_gnu_unified_shared_mem_space = 201) + parameter (ompx_gnu_host_mem_space = 202) integer omp_initial_device, omp_invalid_device parameter (omp_initial_device = -1) parameter (omp_invalid_device = -4) diff --git a/libgomp/plugin/cuda-lib.def b/libgomp/plugin/cuda-lib.def index 007c6e0f4df..016aee8219f 100644 --- a/libgomp/plugin/cuda-lib.def +++ b/libgomp/plugin/cuda-lib.def @@ -30,6 +30,7 @@ CUDA_ONE_CALL (cuLinkDestroy) CUDA_ONE_CALL (cuMemAlloc) CUDA_ONE_CALL (cuMemAllocHost) CUDA_ONE_CALL (cuMemHostAlloc) +CUDA_ONE_CALL (cuMemAllocManaged) CUDA_ONE_CALL (cuMemcpy) CUDA_ONE_CALL (cuMemcpyDtoDAsync) CUDA_ONE_CALL (cuMemcpyDtoH) @@ -50,6 +51,7 @@ CUDA_ONE_CALL (cuModuleLoad) CUDA_ONE_CALL (cuModuleLoadData) CUDA_ONE_CALL (cuModuleUnload) CUDA_ONE_CALL_MAYBE_NULL (cuOccupancyMaxPotentialBlockSize) +CUDA_ONE_CALL (cuPointerGetAttribute) CUDA_ONE_CALL (cuStreamAddCallback) CUDA_ONE_CALL (cuStreamCreate) CUDA_ONE_CALL (cuStreamDestroy) diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 2ef3198ebe1..a6f86c94715 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1058,11 +1058,13 @@ nvptx_stacks_free (struct ptx_device *ptx_dev, bool force) } static void * -nvptx_alloc (size_t s, bool suppress_errors) +nvptx_alloc (size_t s, bool suppress_errors, bool usm) { CUdeviceptr d; - CUresult r = CUDA_CALL_NOCHECK (cuMemAlloc, &d, s); + CUresult r = (usm ? CUDA_CALL_NOCHECK (cuMemAllocManaged, &d, s, + CU_MEM_ATTACH_GLOBAL) + : CUDA_CALL_NOCHECK (cuMemAlloc, &d, s)); if (suppress_errors && r == CUDA_ERROR_OUT_OF_MEMORY) return NULL; else if (r != CUDA_SUCCESS) @@ -1229,8 +1231,13 @@ GOMP_OFFLOAD_get_num_devices (unsigned int omp_requires_mask) { int pi; CUresult r; + /* Check access via migration. */ r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, - CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev); + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, dev); + if (r != CUDA_SUCCESS || pi == 0) + /* Check direct access. */ + r = CUDA_CALL_NOCHECK (cuDeviceGetAttribute, &pi, + CU_DEVICE_ATTRIBUTE_INTEGRATED, dev); if (r != CUDA_SUCCESS || pi == 0) return -1; } @@ -1598,8 +1605,8 @@ GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data) return ret; } -void * -GOMP_OFFLOAD_alloc (int ord, size_t size) +static void * +GOMP_OFFLOAD_alloc_1 (int ord, size_t size, bool usm) { if (!nvptx_attach_host_thread_to_device (ord)) return NULL; @@ -1622,7 +1629,7 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) blocks = tmp; } - void *d = nvptx_alloc (size, true); + void *d = nvptx_alloc (size, true, usm); if (d) return d; else @@ -1630,10 +1637,22 @@ GOMP_OFFLOAD_alloc (int ord, size_t size) /* Memory allocation failed. Try freeing the stacks block, and retrying. */ nvptx_stacks_free (ptx_dev, true); - return nvptx_alloc (size, false); + return nvptx_alloc (size, false, usm); } } +void * +GOMP_OFFLOAD_alloc (int ord, size_t size) +{ + return GOMP_OFFLOAD_alloc_1 (ord, size, false); +} + +void * +GOMP_OFFLOAD_usm_alloc (int ord, size_t size) +{ + return GOMP_OFFLOAD_alloc_1 (ord, size, true); +} + bool GOMP_OFFLOAD_free (int ord, void *ptr) { @@ -1641,6 +1660,25 @@ GOMP_OFFLOAD_free (int ord, void *ptr) && nvptx_free (ptr, ptx_devices[ord])); } +bool +GOMP_OFFLOAD_usm_free (int ord, void *ptr) +{ + return GOMP_OFFLOAD_free (ord, ptr); +} + +bool +GOMP_OFFLOAD_is_usm_ptr (void *ptr) +{ + bool managed = false; + /* This returns 3 outcomes ... + CUDA_ERROR_INVALID_VALUE - Not a Cuda allocated pointer. + CUDA_SUCCESS, managed:false - Cuda allocated, but not USM. + CUDA_SUCCESS, managed:true - USM. */ + CUDA_CALL_NOCHECK (cuPointerGetAttribute, &managed, + CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr)ptr); + return managed; +} + bool GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size) { diff --git a/libgomp/target.c b/libgomp/target.c index effd48bb92f..754dea4e031 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -706,7 +706,9 @@ gomp_map_pointer (struct target_mem_desc *tgt, struct goacc_asyncqueue *aq, splay_tree_key n = gomp_map_lookup (mem_map, &cur_node); if (n == NULL) { - if (allow_zero_length_array_sections) + if (allow_zero_length_array_sections + || (devicep->is_usm_ptr_func + && devicep->is_usm_ptr_func ((void*)cur_node.host_start))) cur_node.tgt_offset = cur_node.host_start; else { @@ -859,6 +861,11 @@ gomp_attach_pointer (struct gomp_device_descr *devicep, return; } + if (devicep->is_usm_ptr_func + && devicep->is_usm_ptr_func ((void*)(target + bias))) + /* Nothing to do here. */ + return; + s.host_start = target + bias; s.host_end = s.host_start + 1; tn = splay_tree_lookup (mem_map, &s); @@ -955,6 +962,7 @@ gomp_map_val (struct target_mem_desc *tgt, void **hostaddrs, size_t i) switch (tgt->list[i].offset) { case OFFSET_INLINED: + case OFFSET_USM: return (uintptr_t) hostaddrs[i]; case OFFSET_POINTER: @@ -1038,6 +1046,7 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, { int kind = get_kind (short_mapkind, kinds, i); bool implicit = get_implicit (short_mapkind, kinds, i); + tgt->list[i].offset = 0; if (hostaddrs[i] == NULL || (kind & typemask) == GOMP_MAP_FIRSTPRIVATE_INT) { @@ -1045,6 +1054,15 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, tgt->list[i].offset = OFFSET_INLINED; continue; } + else if (devicep->is_usm_ptr_func + && devicep->is_usm_ptr_func (hostaddrs[i])) + { + /* The memory is visible from both host and target + so nothing needs to be moved. */ + tgt->list[i].key = NULL; + tgt->list[i].offset = OFFSET_USM; + continue; + } else if ((kind & typemask) == GOMP_MAP_USE_DEVICE_PTR || (kind & typemask) == GOMP_MAP_USE_DEVICE_PTR_IF_PRESENT) { @@ -1398,6 +1416,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, bool implicit = get_implicit (short_mapkind, kinds, i); if (hostaddrs[i] == NULL) continue; + if (tgt->list[i].offset == OFFSET_USM) + continue; switch (kind & typemask) { size_t align, len, first, last; @@ -1595,6 +1615,8 @@ gomp_map_vars_internal (struct gomp_device_descr *devicep, continue; } default: + if (tgt->list[i].offset == OFFSET_INLINED) + continue; break; } splay_tree_key k = &array->key; @@ -4437,6 +4459,56 @@ omp_target_free (void *device_ptr, int device_num) gomp_mutex_unlock (&devicep->lock); } +void * +gomp_usm_alloc (size_t size) +{ + struct gomp_task_icv *icv = gomp_icv (false); + struct gomp_device_descr *devicep = resolve_device (icv->default_device_var, + false); + if (devicep == NULL) + return NULL; + + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + return malloc (size); + + void *ret = NULL; + gomp_mutex_lock (&devicep->lock); + if (devicep->usm_alloc_func) + ret = devicep->usm_alloc_func (devicep->target_id, size); + gomp_mutex_unlock (&devicep->lock); + return ret; +} + +void +gomp_usm_free (void *device_ptr) +{ + if (device_ptr == NULL) + return; + + struct gomp_task_icv *icv = gomp_icv (false); + struct gomp_device_descr *devicep = resolve_device (icv->default_device_var, + false); + if (devicep == NULL) + return; + + if (!(devicep->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) + || devicep->capabilities & GOMP_OFFLOAD_CAP_SHARED_MEM) + { + free (device_ptr); + return; + } + + gomp_mutex_lock (&devicep->lock); + if (devicep->usm_free_func + && !devicep->usm_free_func (devicep->target_id, device_ptr)) + { + gomp_mutex_unlock (&devicep->lock); + gomp_fatal ("error in freeing device memory block at %p", device_ptr); + } + gomp_mutex_unlock (&devicep->lock); +} + /* Device (really: libgomp plugin) to use for paged-locked memory. We assume there is either none or exactly one such device for the lifetime of the process. */ @@ -5294,6 +5366,9 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (unload_image); DLSYM (alloc); DLSYM (free); + DLSYM_OPT (usm_alloc, usm_alloc); + DLSYM_OPT (usm_free, usm_free); + DLSYM_OPT (is_usm_ptr, is_usm_ptr); DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc); DLSYM_OPT (page_locked_host_free, page_locked_host_free); DLSYM (dev2host); diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index 7c109262916..007bdf2d5c4 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -594,3 +594,13 @@ int main() { return 0; } } "-lcuda -lcudart" ] } + +# return 1 if OpenMP Unified Share Memory is supported + +proc check_effective_target_omp_usm { } { + if { [libgomp_check_effective_target_offload_target "nvptx"] } { + return 1 + } + return 0 +} + diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c index d43d78db6fa..0f839ef2957 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c @@ -11,12 +11,13 @@ #pragma omp requires unified_shared_memory, unified_address, reverse_offload -int a[10] = { 0 }; extern void foo (void); int main (void) { + int *a = (int*)__builtin_calloc(10, sizeof (int)); + #pragma omp target map(to: a) for (int i = 0; i < 10; i++) a[i] = i; diff --git a/libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c b/libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c new file mode 100644 index 00000000000..87a84db8953 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/alloc-ompx_gnu_host_mem_alloc-1.c @@ -0,0 +1,77 @@ +/* Verify that on the host we can but on a device we cannot allocate 'ompx_gnu_host_mem_alloc' memory. */ + +/* { dg-additional-options -DOFFLOAD_DEVICE { target offload_device } } */ + +#include <omp.h> + +#pragma omp requires dynamic_allocators + +int main() +{ +#pragma omp target + { + char *c, *c_; + + c = omp_alloc(1, ompx_gnu_host_mem_alloc); +#ifdef OFFLOAD_DEVICE + if (c) + __builtin_abort (); +#else + if (!c) + __builtin_abort (); +#endif + omp_free(c, ompx_gnu_host_mem_alloc); + + c = omp_aligned_alloc(128, 256, ompx_gnu_host_mem_alloc); +#ifdef OFFLOAD_DEVICE + if (c) + __builtin_abort (); +#else + if (!c) + __builtin_abort (); +#endif + omp_free(c, omp_null_allocator); + + c = omp_calloc(1, 1, ompx_gnu_host_mem_alloc); +#ifdef OFFLOAD_DEVICE + if (c) + __builtin_abort (); +#else + if (!c) + __builtin_abort (); +#endif + c_ = omp_realloc(c, 2, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc); +#ifdef OFFLOAD_DEVICE + if (c_) + __builtin_abort (); +#else + if (!c_) + __builtin_abort (); +#endif + c = omp_realloc(c_, 0, ompx_gnu_host_mem_alloc, ompx_gnu_host_mem_alloc); + if (c) + __builtin_abort (); + + c = omp_aligned_calloc(64, 1, 512, ompx_gnu_host_mem_alloc); +#ifdef OFFLOAD_DEVICE + if (c) + __builtin_abort (); +#else + if (!c) + __builtin_abort (); +#endif + c_ = omp_realloc(c, 2, c ? omp_null_allocator : ompx_gnu_host_mem_alloc, omp_null_allocator); +#ifdef OFFLOAD_DEVICE + if (c_) + __builtin_abort (); +#else + if (!c_) + __builtin_abort (); +#endif + c = omp_realloc(c_, 0, omp_null_allocator, omp_null_allocator); + if (c) + __builtin_abort (); + } + + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/usm-1.c b/libgomp/testsuite/libgomp.c/usm-1.c new file mode 100644 index 00000000000..c8e8a9328ee --- /dev/null +++ b/libgomp/testsuite/libgomp.c/usm-1.c @@ -0,0 +1,25 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_usm } */ + +#include <omp.h> +#include <stdint.h> + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_unified_shared_mem_alloc); + if (!a) + __builtin_abort (); + + *a = 42; + uintptr_t a_p = (uintptr_t)a; + + #pragma omp target is_device_ptr(a) + { + if (*a != 42 || a_p != (uintptr_t)a) + __builtin_abort (); + } + + omp_free(a, ompx_gnu_unified_shared_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/usm-2.c b/libgomp/testsuite/libgomp.c/usm-2.c new file mode 100644 index 00000000000..9f414b16319 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/usm-2.c @@ -0,0 +1,33 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_usm } */ + +#include <omp.h> +#include <stdint.h> + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc); + if (!a) + __builtin_abort (); + + a[0] = 42; + a[1] = 43; + + uintptr_t a_p = (uintptr_t)a; + + #pragma omp target map(a[0]) + { + if (a[0] != 42 || a_p != (uintptr_t)a) + __builtin_abort (); + } + + #pragma omp target map(a[1]) + { + if (a[1] != 43 || a_p != (uintptr_t)a) + __builtin_abort (); + } + + omp_free(a, ompx_gnu_unified_shared_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/usm-3.c b/libgomp/testsuite/libgomp.c/usm-3.c new file mode 100644 index 00000000000..d7a77a5c2ee --- /dev/null +++ b/libgomp/testsuite/libgomp.c/usm-3.c @@ -0,0 +1,36 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_usm } */ + +#include <omp.h> +#include <stdint.h> + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc); + if (!a) + __builtin_abort (); + + a[0] = 42; + a[1] = 43; + + uintptr_t a_p = (uintptr_t)a; + +#pragma omp target data map(a[0:2]) + { +#pragma omp target + { + if (a[0] != 42 || a_p != (uintptr_t)a) + __builtin_abort (); + } + +#pragma omp target + { + if (a[1] != 43 || a_p != (uintptr_t)a) + __builtin_abort (); + } + } + + omp_free(a, ompx_gnu_unified_shared_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/usm-4.c b/libgomp/testsuite/libgomp.c/usm-4.c new file mode 100644 index 00000000000..825bb4e8b3e --- /dev/null +++ b/libgomp/testsuite/libgomp.c/usm-4.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_usm } */ + +#include <omp.h> +#include <stdint.h> + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int)*2, ompx_gnu_unified_shared_mem_alloc); + if (!a) + __builtin_abort (); + + a[0] = 42; + a[1] = 43; + + uintptr_t a_p = (uintptr_t)a; + +#pragma omp target enter data map(to:a[0:2]) + +#pragma omp target + { + if (a[0] != 42 || a_p != (uintptr_t)a) + __builtin_abort (); + } + +#pragma omp target + { + if (a[1] != 43 || a_p != (uintptr_t)a) + __builtin_abort (); + } + +#pragma omp target exit data map(delete:a[0:2]) + + omp_free(a, ompx_gnu_unified_shared_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.c/usm-5.c b/libgomp/testsuite/libgomp.c/usm-5.c new file mode 100644 index 00000000000..00332050591 --- /dev/null +++ b/libgomp/testsuite/libgomp.c/usm-5.c @@ -0,0 +1,28 @@ +/* { dg-do run } */ +/* { dg-require-effective-target omp_usm } */ + +#include <omp.h> +#include <stdint.h> + +#pragma omp requires unified_shared_memory + +int +main () +{ + int *a = (int *) omp_alloc(sizeof(int), ompx_gnu_host_mem_alloc); + if (!a) + __builtin_abort (); + + a[0] = 42; + + uintptr_t a_p = (uintptr_t)a; + +#pragma omp target map(a[0:1]) + { + if (a[0] != 42 || a_p == (uintptr_t)a) + __builtin_abort (); + } + + omp_free(a, ompx_gnu_host_mem_alloc); + return 0; +} diff --git a/libgomp/testsuite/libgomp.fortran/usm-3.f90 b/libgomp/testsuite/libgomp.fortran/usm-3.f90 new file mode 100644 index 00000000000..ff15f4ba1f5 --- /dev/null +++ b/libgomp/testsuite/libgomp.fortran/usm-3.f90 @@ -0,0 +1,33 @@ +! { dg-do run } +! { dg-require-effective-target omp_usm } + +! Ensure that derived types containing allocated values work +! with Unified Shared Memory. + +program usm +!$omp requires unified_shared_memory + use iso_fortran_env + implicit none + + type :: struct + real(real64), allocatable :: v(:) + end type struct + + integer :: index + type(struct) :: s + + real(real64), allocatable :: expected(:) + + allocate(s%v(100)) + do index = 1, size(s%v) + s%v(index) = index + end do + allocate(expected, mold=s%v) + expected = s%v - 1._real64 + + !$omp target + s%v = s%v - 1._real64 + !$omp end target + + if (any(s%v /= expected)) STOP 1 +end program usm -- 2.41.0