https://github.com/AntonRydahl updated https://github.com/llvm/llvm-project/pull/66968
>From b35340e47de896c9933c54ce617538c46cf01488 Mon Sep 17 00:00:00 2001 From: AntonRydahl <rydahl2...@gmail.com> Date: Wed, 20 Sep 2023 17:06:10 -0700 Subject: [PATCH 1/5] Adding OpenMP Offloading Backend for C++ Parallel Algorithms --- libcxx/CMakeLists.txt | 14 +++ libcxx/include/CMakeLists.txt | 5 + libcxx/include/__algorithm/pstl_backend.h | 8 ++ .../__algorithm/pstl_backends/gpu_backend.h | 21 +++++ .../pstl_backends/gpu_backends/backend.h | 33 +++++++ .../pstl_backends/gpu_backends/fill.h | 59 ++++++++++++ .../pstl_backends/gpu_backends/for_each.h | 59 ++++++++++++ .../pstl_backends/gpu_backends/omp_offload.h | 91 +++++++++++++++++++ libcxx/include/__config_site.in | 1 + 9 files changed, 291 insertions(+) create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backend.h create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index bb2898b799bcef9..43d2a448de79584 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -290,6 +290,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32 option(LIBCXX_HAS_EXTERNAL_THREAD_API "Build libc++ with an externalized threading API. This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." 
OFF) +option(LIBCXX_ENABLE_GPU_OFFLOAD + "Build libc++ with support for GPU offload" OFF) if (LIBCXX_ENABLE_THREADS) set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use") @@ -297,6 +299,14 @@ else() set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use") endif() +if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND) + if (${LIBCXX_ENABLE_GPU_OFFLOAD}) + set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use") + else() + set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use") + endif() +endif() + # Misc options ---------------------------------------------------------------- # FIXME: Turn -pedantic back ON. It is currently off because it warns # about #include_next which is used everywhere. @@ -809,6 +819,10 @@ else() Valid backends are: serial, std_thread and libdispatch") endif() +if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload") + config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +endif() + if (LIBCXX_ABI_DEFINES) set(abi_defines) foreach (abi_define ${LIBCXX_ABI_DEFINES}) diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 2ec755236dbaee2..a3d72df61a86dde 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -85,6 +85,11 @@ set(files __algorithm/pstl_backends/cpu_backends/thread.h __algorithm/pstl_backends/cpu_backends/transform.h __algorithm/pstl_backends/cpu_backends/transform_reduce.h + __algorithm/pstl_backends/gpu_backend.h + __algorithm/pstl_backends/gpu_backends/backend.h + __algorithm/pstl_backends/gpu_backends/fill.h + __algorithm/pstl_backends/gpu_backends/for_each.h + __algorithm/pstl_backends/gpu_backends/omp_offload.h __algorithm/pstl_copy.h __algorithm/pstl_count.h __algorithm/pstl_fill.h diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h index 93372f019031b63..f051e0ce9be13c3 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ 
b/libcxx/include/__algorithm/pstl_backend.h @@ -10,6 +10,7 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H #include <__algorithm/pstl_backends/cpu_backend.h> +#include <__algorithm/pstl_backends/gpu_backend.h> #include <__config> #include <execution> @@ -179,10 +180,17 @@ struct __select_backend<std::execution::parallel_policy> { using type = __cpu_backend_tag; }; +# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +template <> +struct __select_backend<std::execution::parallel_unsequenced_policy> { + using type = __gpu_backend_tag; +}; +# else template <> struct __select_backend<std::execution::parallel_unsequenced_policy> { using type = __cpu_backend_tag; }; +# endif # else diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h new file mode 100644 index 000000000000000..46a85f77b5deb99 --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H + +#include <__config> + +#include <__algorithm/pstl_backends/gpu_backends/backend.h> + +#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +#include <__algorithm/pstl_backends/gpu_backends/fill.h> +#include <__algorithm/pstl_backends/gpu_backends/for_each.h> +#endif + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h new file mode 100644 index 000000000000000..a8b400afbb94d9d --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H + +#include <__config> +#include <cstddef> + +#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h> +#endif + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +struct __gpu_backend_tag {}; + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h new file mode 100644 index 000000000000000..5603e18a5d2d3fc --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H + +#include <__algorithm/fill.h> +#include <__algorithm/pstl_backends/gpu_backends/backend.h> +#include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__config> +#include <__iterator/concepts.h> +#include <__type_traits/is_execution_policy.h> +#include <__utility/terminate_on_exception.h> +#include <stdio.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template <class _ExecutionPolicy, class _ForwardIterator, class _Tp> +_LIBCPP_HIDE_FROM_ABI void +__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { + // It is only safe to execute for_each on the GPU, it the execution policy is + // parallel unsequenced, as it is the only execution policy prohibiting throwing + // exceptions and allowing SIMD instructions + if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value); + } + // Else if the excution policy is parallel, we execute for_each on the CPU instead + else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + std::__terminate_on_exception([&] { + __par_backend::__parallel_for( + __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { + std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>( + __cpu_backend_tag{}, __brick_first, __brick_last, __value); + }); + }); + // 
Else we execute for_each in serial + } else { + std::fill(__first, __last, __value); + } +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h new file mode 100644 index 000000000000000..20486d83863f420 --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h @@ -0,0 +1,59 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H + +#include <__algorithm/for_each.h> +#include <__algorithm/pstl_backends/gpu_backends/backend.h> +#include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__config> +#include <__iterator/concepts.h> +#include <__type_traits/is_execution_policy.h> +#include <__utility/terminate_on_exception.h> +#include <stdio.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template <class _ExecutionPolicy, class _ForwardIterator, class _Functor> +_LIBCPP_HIDE_FROM_ABI void +__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { + // It is only safe to execute for_each on the GPU, it the execution policy is + // parallel unsequenced, as it is the only execution policy prohibiting throwing + // exceptions 
and allowing SIMD instructions + if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func); + } + // Else if the excution policy is parallel, we execute for_each on the CPU instead + else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + std::__terminate_on_exception([&] { + std::__par_backend::__parallel_for( + __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { + std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>( + __cpu_backend_tag{}, __brick_first, __brick_last, __func); + }); + }); + // Else we execute for_each in serial + } else { + std::for_each(__first, __last, __func); + } +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h new file mode 100644 index 000000000000000..840118dbec5057c --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h @@ -0,0 +1,91 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H + +#include <__assert> +#include <__config> +#include <__utility/move.h> +#include <cstddef> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_PUSH_MACROS +#include <__undef_macros> + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +namespace __par_backend { +inline namespace __omp_gpu_backend { + +// In OpenMP, we need to extract the pointer for the underlying data for data +// structures like std::vector and std::array to be able to map the data to the +// device. + +template <typename T> +_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) { + return p; +} + +template <typename T> +_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) { + std::pointer_traits<std::__wrap_iter<T>> PT; + return PT.to_address(w); +} + +// Applying function or lambda in a loop + +template <class _Iterator, class _DifferenceType, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { + #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__first[__i]); + + return __first + __n; +} + +// Extracting the underlying pointer + +template <class _Iterator, class _DifferenceType, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { + __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f); + return __first + __n; +} + +// Assigning a value in a loop + +template <class _Index, class 
_DifferenceType, class _Tp> +_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { + #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __first[__i] = __value; + + return __first + __n; +} + +template <class _Index, class _DifferenceType, class _Tp> +_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { + __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value); + return __first + __n; +} + +} // namespace __omp_gpu_backend +} // namespace __par_backend + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && && _LIBCPP_STD_VER >= 17 + +_LIBCPP_POP_MACROS + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index c85cbcd02c441b9..e0edddce3afc3ff 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -34,6 +34,7 @@ #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH +#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD // Hardening. 
#cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT >From af5ddf7709e44435c3b0b15421aa9cfc24b49e84 Mon Sep 17 00:00:00 2001 From: antonrydahl <rydahl2...@gmail.com> Date: Wed, 20 Sep 2023 17:48:25 -0700 Subject: [PATCH 2/5] Clang formatting OpenMP backend for parallel algorithms --- libcxx/include/__algorithm/pstl_backend.h | 6 +++--- .../include/__algorithm/pstl_backends/gpu_backend.h | 4 ++-- .../__algorithm/pstl_backends/gpu_backends/fill.h | 12 ++++++------ .../pstl_backends/gpu_backends/for_each.h | 12 ++++++------ .../pstl_backends/gpu_backends/omp_offload.h | 13 ++++++++----- 5 files changed, 25 insertions(+), 22 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h index f051e0ce9be13c3..47f5191b48517ba 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ b/libcxx/include/__algorithm/pstl_backend.h @@ -180,17 +180,17 @@ struct __select_backend<std::execution::parallel_policy> { using type = __cpu_backend_tag; }; -# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) template <> struct __select_backend<std::execution::parallel_unsequenced_policy> { using type = __gpu_backend_tag; }; -# else +# else template <> struct __select_backend<std::execution::parallel_unsequenced_policy> { using type = __cpu_backend_tag; }; -# endif +# endif # else diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h index 46a85f77b5deb99..7237036156a1bf3 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h @@ -14,8 +14,8 @@ #include <__algorithm/pstl_backends/gpu_backends/backend.h> #if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) -#include <__algorithm/pstl_backends/gpu_backends/fill.h> -#include <__algorithm/pstl_backends/gpu_backends/for_each.h> +# include <__algorithm/pstl_backends/gpu_backends/fill.h> +# include 
<__algorithm/pstl_backends/gpu_backends/for_each.h> #endif #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h index 5603e18a5d2d3fc..32926da87e2a083 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h @@ -10,8 +10,8 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H #include <__algorithm/fill.h> -#include <__algorithm/pstl_backends/gpu_backends/backend.h> #include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__algorithm/pstl_backends/gpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> #include <__type_traits/is_execution_policy.h> @@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _ExecutionPolicy, class _ForwardIterator, class _Tp> _LIBCPP_HIDE_FROM_ABI void __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) { - // It is only safe to execute for_each on the GPU, it the execution policy is + // It is only safe to execute for_each on the GPU, it the execution policy is // parallel unsequenced, as it is the only execution policy prohibiting throwing // exceptions and allowing SIMD instructions if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && - __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value); } // Else if the excution policy is parallel, we execute for_each on the CPU instead - else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && - __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + 
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { std::__terminate_on_exception([&] { __par_backend::__parallel_for( __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { @@ -46,7 +46,7 @@ __pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last __cpu_backend_tag{}, __brick_first, __brick_last, __value); }); }); - // Else we execute for_each in serial + // Else we execute for_each in serial } else { std::fill(__first, __last, __value); } diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h index 20486d83863f420..14de2af8e4a15c6 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h @@ -10,8 +10,8 @@ #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H #include <__algorithm/for_each.h> -#include <__algorithm/pstl_backends/gpu_backends/backend.h> #include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__algorithm/pstl_backends/gpu_backends/backend.h> #include <__config> #include <__iterator/concepts.h> #include <__type_traits/is_execution_policy.h> @@ -29,16 +29,16 @@ _LIBCPP_BEGIN_NAMESPACE_STD template <class _ExecutionPolicy, class _ForwardIterator, class _Functor> _LIBCPP_HIDE_FROM_ABI void __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) { - // It is only safe to execute for_each on the GPU, it the execution policy is + // It is only safe to execute for_each on the GPU, it the execution policy is // parallel unsequenced, as it is the only execution policy prohibiting throwing // exceptions and allowing SIMD instructions if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && - __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + 
__has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func); } // Else if the excution policy is parallel, we execute for_each on the CPU instead - else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && - __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { + else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) { std::__terminate_on_exception([&] { std::__par_backend::__parallel_for( __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { @@ -46,7 +46,7 @@ __pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __ __cpu_backend_tag{}, __brick_first, __brick_last, __func); }); }); - // Else we execute for_each in serial + // Else we execute for_each in serial } else { std::for_each(__first, __last, __func); } diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h index 840118dbec5057c..4baa4e7f65859d1 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h @@ -46,8 +46,9 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) { // Applying function or lambda in a loop template <class _Iterator, class _DifferenceType, class _Function> -_LIBCPP_HIDE_FROM_ABI _Iterator __omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { - #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) +_LIBCPP_HIDE_FROM_ABI _Iterator +__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { +# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) for (_DifferenceType 
__i = 0; __i < __n; ++__i) __f(__first[__i]); @@ -65,8 +66,9 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ // Assigning a value in a loop template <class _Index, class _DifferenceType, class _Tp> -_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { - #pragma omp target teams distribute parallel for simd map(tofrom:__first[0:__n]) map(to:__value) +_LIBCPP_HIDE_FROM_ABI _Index +__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { +# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value) for (_DifferenceType __i = 0; __i < __n; ++__i) __first[__i] = __value; @@ -74,7 +76,8 @@ _LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1(_Index __first, _Diff } template <class _Index, class _DifferenceType, class _Tp> -_LIBCPP_HIDE_FROM_ABI _Index __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { +_LIBCPP_HIDE_FROM_ABI _Index +__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept { __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value); return __first + __n; } >From 57abf3062c4e559fddd6bf173d415212b9f92e43 Mon Sep 17 00:00:00 2001 From: AntonRydahl <rydahl2...@gmail.com> Date: Thu, 21 Sep 2023 12:50:17 -0700 Subject: [PATCH 3/5] Making PSTL GPU backend depend on CMake options rather than command line options --- libcxx/CMakeLists.txt | 11 +++++------ libcxx/include/__algorithm/pstl_backend.h | 2 +- .../include/__algorithm/pstl_backends/gpu_backend.h | 2 +- .../__algorithm/pstl_backends/gpu_backends/backend.h | 8 ++++++-- libcxx/include/__config_site.in | 1 + 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 43d2a448de79584..7aa47caa1ca335a 100644 --- a/libcxx/CMakeLists.txt +++ 
b/libcxx/CMakeLists.txt @@ -299,12 +299,10 @@ else() set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use") endif() -if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND) - if (${LIBCXX_ENABLE_GPU_OFFLOAD}) - set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use") - else() - set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use") - endif() +if (${LIBCXX_ENABLE_GPU_OFFLOAD}) + set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use") +else() + set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use") endif() # Misc options ---------------------------------------------------------------- @@ -819,6 +817,7 @@ else() Valid backends are: serial, std_thread and libdispatch") endif() +config_define_if(LIBCXX_ENABLE_GPU_OFFLOAD _LIBCPP_PSTL_GPU_OFFLOAD) if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload") config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) endif() diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h index 47f5191b48517ba..0df8847fd33589a 100644 --- a/libcxx/include/__algorithm/pstl_backend.h +++ b/libcxx/include/__algorithm/pstl_backend.h @@ -180,7 +180,7 @@ struct __select_backend<std::execution::parallel_policy> { using type = __cpu_backend_tag; }; -# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +# if defined(_LIBCPP_PSTL_GPU_OFFLOAD) template <> struct __select_backend<std::execution::parallel_unsequenced_policy> { using type = __gpu_backend_tag; diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h index 7237036156a1bf3..d2a814b441224a5 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h @@ -13,7 +13,7 @@ #include <__algorithm/pstl_backends/gpu_backends/backend.h> -#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +#if 
defined(_LIBCPP_PSTL_GPU_OFFLOAD) # include <__algorithm/pstl_backends/gpu_backends/fill.h> # include <__algorithm/pstl_backends/gpu_backends/for_each.h> #endif diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h index a8b400afbb94d9d..a03ad35d8d2ae3e 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h @@ -12,8 +12,12 @@ #include <__config> #include <cstddef> -#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) -# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h> +#if defined(_LIBCPP_PSTL_GPU_OFFLOAD) +# if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD) +# include <__algorithm/pstl_backends/gpu_backends/omp_offload.h> +# else +# error Invalid PSTL GPU backend +# endif #endif #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in index e0edddce3afc3ff..e7fb4f423079333 100644 --- a/libcxx/include/__config_site.in +++ b/libcxx/include/__config_site.in @@ -35,6 +35,7 @@ #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH #cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD +#cmakedefine _LIBCPP_PSTL_GPU_OFFLOAD // Hardening. 
#cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT >From 51d9ed5702a46ac604bafbe2f707033639f86706 Mon Sep 17 00:00:00 2001 From: AntonRydahl <rydahl2...@gmail.com> Date: Thu, 21 Sep 2023 17:07:58 -0700 Subject: [PATCH 4/5] Added OpenMP offloaded version of std::transform --- libcxx/include/CMakeLists.txt | 1 + .../__algorithm/pstl_backends/gpu_backend.h | 1 + .../pstl_backends/gpu_backends/omp_offload.h | 119 +++++++++++++++++- .../pstl_backends/gpu_backends/transform.h | 117 +++++++++++++++++ 4 files changed, 233 insertions(+), 5 deletions(-) create mode 100644 libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index a3d72df61a86dde..66e54cfbf1780ee 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -90,6 +90,7 @@ set(files __algorithm/pstl_backends/gpu_backends/fill.h __algorithm/pstl_backends/gpu_backends/for_each.h __algorithm/pstl_backends/gpu_backends/omp_offload.h + __algorithm/pstl_backends/gpu_backends/transform.h __algorithm/pstl_copy.h __algorithm/pstl_count.h __algorithm/pstl_fill.h diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h index d2a814b441224a5..dac26592dac5c1f 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h @@ -16,6 +16,7 @@ #if defined(_LIBCPP_PSTL_GPU_OFFLOAD) # include <__algorithm/pstl_backends/gpu_backends/fill.h> # include <__algorithm/pstl_backends/gpu_backends/for_each.h> +# include <__algorithm/pstl_backends/gpu_backends/transform.h> #endif #endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h index 4baa4e7f65859d1..69221cbb8519233 100644 --- 
a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h @@ -28,6 +28,17 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace __par_backend { inline namespace __omp_gpu_backend { +// Checking if a pointer is in a range +template <typename T1, typename T2, typename T3> +_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T1 a, T2 p, T3 b) { + return false; +} + +template <typename T> +_LIBCPP_HIDE_FROM_ABI inline bool __omp_in_ptr_range(T* a, T* p, T* b) { + return std::less_equal<T*>{}(a, p) && std::less<T*>{}(p, b); +} + // In OpenMP, we need to extract the pointer for the underlying data for data // structures like std::vector and std::array to be able to map the data to the // device. @@ -43,12 +54,16 @@ _LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) { return PT.to_address(w); } +//===----------------------------------------------------------------------===// +// Templates for one iterator +//===----------------------------------------------------------------------===// + // Applying function or lambda in a loop template <class _Iterator, class _DifferenceType, class _Function> _LIBCPP_HIDE_FROM_ABI _Iterator -__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept { -# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) +__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f, const int __device = 0) noexcept { +# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) device(__device) for (_DifferenceType __i = 0; __i < __n; ++__i) __f(__first[__i]); @@ -66,9 +81,10 @@ _LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _Differ // Assigning a value in a loop template <class _Index, class _DifferenceType, class _Tp> -_LIBCPP_HIDE_FROM_ABI _Index -__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const 
_Tp& __value) noexcept { -# pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value) +_LIBCPP_HIDE_FROM_ABI _Index __omp_parallel_for_simd_val_1( + _Index __first, _DifferenceType __n, const _Tp& __value, const int __device = 0) noexcept { +# pragma omp target teams distribute parallel for simd map(from : __first[0 : __n]) map(always, to : __value) \ + device(__device) for (_DifferenceType __i = 0; __i < __n; ++__i) __first[__i] = __value; @@ -82,6 +98,99 @@ __parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __valu return __first + __n; } +//===----------------------------------------------------------------------===// +// Templates for two iterators +//===----------------------------------------------------------------------===// + +template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_2( + _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Function __f, const int __device = 0) noexcept { + if ((!std::is_same<_Iterator1, _Iterator2>::value) || + (std::is_same<_Iterator1, _Iterator2>::value && + !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first2, __first1 + __n))) { +# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n]) map(from : __first2[0 : __n]) \ + device(__device) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__first1[__i], __first2[__i]); + return __first1 + __n; + } +# pragma omp target teams distribute parallel for simd map(tofrom : __first1[0 : __n], __first2[0 : __n]) \ + device(__device) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__first1[__i], __first2[__i]); + + return __first1 + __n; +} + +// Extracting the underlying pointer + +template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator1 +__parallel_for_simd_2(_Iterator1 __first1, _DifferenceType __n, _Iterator2 
__first2, _Function __f) noexcept { + __omp_parallel_for_simd_2( + __omp_gpu_backend::__omp_extract_base_ptr(__first1), + __n, + __omp_gpu_backend::__omp_extract_base_ptr(__first2), + __f); + return __first1 + __n; +} + +//===----------------------------------------------------------------------===// +// Templates for three iterators +//===----------------------------------------------------------------------===// + +template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator1 __omp_parallel_for_simd_3( + _Iterator1 __first1, + _DifferenceType __n, + _Iterator2 __first2, + _Iterator3 __first3, + _Function __f, + const int __device = 0) noexcept { + // It may be that __first3 is in the range [__first1, __first1 + __n) or [__first2, __first2 + __n). + // It is, however, undefined behavior to compare two pointers that do not + // point to the same object or are not the same type. + // If we can prove that __first3 is not in any of the ranges [__first1, __first1 + __n) + // or [__first2, __first2 + __n), it is safe to reduce the amount of data copied to and + // from the device. + constexpr bool are_not_same_type = + !std::is_same<_Iterator1, _Iterator2>::value && !std::is_same<_Iterator1, _Iterator3>::value; + const bool no_overlap_13 = + std::is_same<_Iterator1, _Iterator3>::value && + !__omp_gpu_backend::__omp_in_ptr_range(__first1, __first3, __first1 + __n); + const bool no_overlap_23 = + std::is_same<_Iterator2, _Iterator3>::value && + !__omp_gpu_backend::__omp_in_ptr_range(__first2, __first3, __first2 + __n); + if (are_not_same_type || (no_overlap_13 && no_overlap_23)) { +# pragma omp target teams distribute parallel for simd map(to : __first1[0 : __n], __first2[0 : __n]) \ + map(from : __first3[0 : __n]) device(__device) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__first1[__i], __first2[__i], __first3[__i]); + return __first1 + __n; + } + // In the general case, we have to map all data to and from the device +# pragma omp
target teams distribute parallel for simd map( \ + tofrom : __first1[0 : __n], __first2[0 : __n], __first3[0 : __n]) device(__device) + for (_DifferenceType __i = 0; __i < __n; ++__i) + __f(__first1[__i], __first2[__i], __first3[__i]); + + return __first1 + __n; +} + +// Extracting the underlying pointer + +template <class _Iterator1, class _DifferenceType, class _Iterator2, class _Iterator3, class _Function> +_LIBCPP_HIDE_FROM_ABI _Iterator1 __parallel_for_simd_3( + _Iterator1 __first1, _DifferenceType __n, _Iterator2 __first2, _Iterator3 __first3, _Function __f) noexcept { + __omp_parallel_for_simd_3( + __omp_gpu_backend::__omp_extract_base_ptr(__first1), + __n, + __omp_gpu_backend::__omp_extract_base_ptr(__first2), + __omp_gpu_backend::__omp_extract_base_ptr(__first3), + __f); + return __first1 + __n; +} + } // namespace __omp_gpu_backend } // namespace __par_backend diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h new file mode 100644 index 000000000000000..03eba11a3f5f52b --- /dev/null +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H +#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H + +#include <__algorithm/pstl_backends/cpu_backends/backend.h> +#include <__algorithm/pstl_backends/gpu_backends/backend.h> +#include <__algorithm/transform.h> +#include <__config> +#include <__iterator/concepts.h> +#include <__iterator/iterator_traits.h> +#include <__type_traits/enable_if.h> +#include <__type_traits/is_execution_policy.h> +#include <__type_traits/remove_cvref.h> +#include <__utility/terminate_on_exception.h> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +_LIBCPP_BEGIN_NAMESPACE_STD + +template <class _ExecutionPolicy, class _ForwardIterator, class _ForwardOutIterator, class _UnaryOperation> +_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( + __gpu_backend_tag, + _ForwardIterator __first, + _ForwardIterator __last, + _ForwardOutIterator __result, + _UnaryOperation __op) { + if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && + __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + return std::__par_backend::__parallel_for_simd_2( + __first, + __last - __first, + __result, + [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) { + __out_value = __op(__in_value); + }); + } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && + __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + std::__terminate_on_exception([&] { + 
std::__par_backend::__parallel_for( + __first, __last, [__op, __first, __result](_ForwardIterator __brick_first, _ForwardIterator __brick_last) { + return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( + __cpu_backend_tag{}, __brick_first, __brick_last, __result + (__brick_first - __first), __op); + }); + }); + return __result + (__last - __first); + } else { + return std::transform(__first, __last, __result, __op); + } +} + +template <class _ExecutionPolicy, + class _ForwardIterator1, + class _ForwardIterator2, + class _ForwardOutIterator, + class _BinaryOperation, + enable_if_t<is_execution_policy_v<__remove_cvref_t<_ExecutionPolicy>>, int> = 0> +_LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( + __gpu_backend_tag, + _ForwardIterator1 __first1, + _ForwardIterator1 __last1, + _ForwardIterator2 __first2, + _ForwardOutIterator __result, + _BinaryOperation __op) { + if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && + __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && + __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + return std::__par_backend::__parallel_for_simd_3( + __first1, + __last1 - __first1, + __first2, + __result, + [&](__iter_reference<_ForwardIterator1> __in1, + __iter_reference<_ForwardIterator2> __in2, + __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); }); + } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && + __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && + __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && + __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + std::__terminate_on_exception([&] { + std::__par_backend::__parallel_for( + __first1, + __last1, + [__op, __first1, __first2, __result](_ForwardIterator1 
__brick_first, _ForwardIterator1 __brick_last) { + return std::__pstl_transform<__remove_parallel_policy_t<_ExecutionPolicy>>( + __cpu_backend_tag{}, + __brick_first, + __brick_last, + __first2 + (__brick_first - __first1), + __result + (__brick_first - __first1), + __op); + }); + }); + return __result + (__last1 - __first1); + } else { + return std::transform(__first1, __last1, __first2, __result, __op); + } +} + +_LIBCPP_END_NAMESPACE_STD + +#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17 + +#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKENDS_TRANSFORM_H >From 33b61efe005cb12247cdf89ce7b7a4c5ca849f5b Mon Sep 17 00:00:00 2001 From: AntonRydahl <rydahl2...@gmail.com> Date: Fri, 22 Sep 2023 11:55:53 -0700 Subject: [PATCH 5/5] Changing lambdas to capture by value in std::transform for GPUs --- .../__algorithm/pstl_backends/gpu_backends/transform.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h index 03eba11a3f5f52b..7fcfde44aaaa7a6 100644 --- a/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h +++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/transform.h @@ -38,11 +38,13 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> && __has_random_access_iterator_category_or_concept<_ForwardIterator>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + // While the CPU backend captures by reference, [&], that is not valid when + // offloading to the GPU. Therefore we must capture by value, [=]. 
return std::__par_backend::__parallel_for_simd_2( __first, __last - __first, __result, - [&](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) { + [=](__iter_reference<_ForwardIterator> __in_value, __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in_value); }); } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && @@ -78,12 +80,14 @@ _LIBCPP_HIDE_FROM_ABI _ForwardOutIterator __pstl_transform( __has_random_access_iterator_category_or_concept<_ForwardIterator1>::value && __has_random_access_iterator_category_or_concept<_ForwardIterator2>::value && __has_random_access_iterator_category_or_concept<_ForwardOutIterator>::value) { + // While the CPU backend captures by reference, [&], that is not valid when + // offloading to the GPU. Therefore we must capture by value, [=]. return std::__par_backend::__parallel_for_simd_3( __first1, __last1 - __first1, __first2, __result, - [&](__iter_reference<_ForwardIterator1> __in1, + [=](__iter_reference<_ForwardIterator1> __in1, __iter_reference<_ForwardIterator2> __in2, __iter_reference<_ForwardOutIterator> __out_value) { __out_value = __op(__in1, __in2); }); } else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> && _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits