Hi, thanks for the review, @Richard! I have tried to address most of your comments in this patch. The major updates include optimizing operator[] for masks, find_first_set, and find_last_set.

My further comments on some of the issues you pointed out (short illustrative sketches for points a, b, and g follow this list):

a. Regarding the coverage of types supported for SVE: yes, all types are covered by mapping each type onto an SVE-supported type using two simple rules: the size of the type and its signedness.
b. All operator overloads now use infix operators. For division and remainder, the inactive elements of the divisor are padded with 1 to avoid undefined behavior.
c. isnan is optimized down to only two cases: the finite-math-only case, and the case where svcmpuo is used.
d. _S_load for masks (bool) now uses svld1 by reinterpret_casting the pointer to a uint8_t pointer and then performing an svunpklo. The same optimization is not done for masked loads and stores, because converting a mask from a larger element type to a smaller one is not optimal (it is sequential).
e. _S_unary_minus could not use svneg_x because it does not support unsigned types.
f. Added specializations for reductions.
g. find_first_set and find_last_set are optimized using svclastb.
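To make point (a) concrete, here is a small, self-contained sketch of the two-rule mapping (size plus signedness). The names map_to_sve_element/sve_element_t are made up for illustration; the patch implements this idea as __get_sve_value_type.

#include <cstdint>
#include <type_traits>

// Pick the canonical SVE element type from only two properties of T: its size
// and whether it is signed. Floating point is checked first, because
// is_signed_v<float> is also true.
template <typename T>
constexpr auto map_to_sve_element()
{
  if constexpr (std::is_floating_point_v<T>)
    {
      if constexpr (sizeof(T) == 4) return float{};
      else return double{};
    }
  else if constexpr (std::is_signed_v<T>)
    {
      if constexpr (sizeof(T) == 1) return std::int8_t{};
      else if constexpr (sizeof(T) == 2) return std::int16_t{};
      else if constexpr (sizeof(T) == 4) return std::int32_t{};
      else return std::int64_t{};
    }
  else
    {
      if constexpr (sizeof(T) == 1) return std::uint8_t{};
      else if constexpr (sizeof(T) == 2) return std::uint16_t{};
      else if constexpr (sizeof(T) == 4) return std::uint32_t{};
      else return std::uint64_t{};
    }
}

template <typename T>
using sve_element_t = decltype(map_to_sve_element<T>());

// char32_t, wchar_t, long long, ... all collapse onto one of the eight
// fixed-width integer types, so only those need dedicated SVE intrinsic
// specializations.
static_assert(std::is_same_v<sve_element_t<char32_t>, std::uint32_t>);
static_assert(std::is_same_v<sve_element_t<long long>, std::int64_t>);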
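For point (b), this is a portable scalar model of why padding the divisor's inactive lanes with 1 keeps a full-width vector division free of undefined behavior; it mirrors the svsel-plus-broadcast(1) step in _S_divides, but the function name and lane handling here are purely illustrative.

#include <array>
#include <cstddef>

// Masked division model: inactive lanes of the divisor are replaced by 1
// (the svsel(mask, y, broadcast(1)) step), so dividing every lane afterwards
// never divides by a stale or zero value in an inactive lane.
template <typename T, std::size_t N>
std::array<T, N>
masked_divide(const std::array<T, N>& x, const std::array<T, N>& y,
              const std::array<bool, N>& active)
{
  std::array<T, N> padded_y{};
  for (std::size_t i = 0; i < N; ++i)
    padded_y[i] = active[i] ? y[i] : T(1);

  std::array<T, N> r{};
  for (std::size_t i = 0; i < N; ++i)
    r[i] = x[i] / padded_y[i];   // full-width divide; inactive lanes are don't-care
  return r;
}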
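For point (g), a minimal sketch of the svclastb idea for find_last_set. This is not the patch's exact code: it assumes 32-bit mask lanes, an SVE-enabled target, and the helper name sve_find_last_set is made up. svindex builds the per-lane indices and svclastb extracts the index at the last active lane.

#include <arm_sve.h>
#include <cstdint>

// n_lanes is the number of lanes that actually belong to the simd object
// (the fixed _UsedBytes / sizeof(T) count).
inline int
sve_find_last_set(svbool_t mask, std::uint64_t n_lanes)
{
  // Limit the query to the simd's own lanes.
  svbool_t active = svand_z(svptrue_b32(), mask,
                            svwhilelt_b32(std::uint64_t(0), n_lanes));
  // indices = {0, 1, 2, ...}; svclastb returns the element of indices at the
  // last true lane of active, or the scalar fallback if no lane is true.
  svuint32_t indices = svindex_u32(0, 1);
  return static_cast<int>(svclastb(active, std::uint32_t(-1), indices));
}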
libstdc++-v3/ChangeLog:

	* include/Makefile.am: Add simd_sve.h.
	* include/Makefile.in: Add simd_sve.h.
	* include/experimental/bits/simd.h: Add new _SveAbi.
	* include/experimental/bits/simd_builtin.h: Use __no_sve_deduce_t to support the existing NEON ABI.
	* include/experimental/bits/simd_converter.h: Convert sequentially when SVE is available.
	* include/experimental/bits/simd_detail.h: Define SVE-specific macro.
	* include/experimental/bits/simd_math.h: Fall back to sequential execution for frexp when SVE is available, to handle the fixed_size_simd return type that always uses SVE.
	* include/experimental/simd: Include bits/simd_sve.h.
	* testsuite/experimental/simd/tests/bits/main.h: Enable testing for sve128, sve256, sve512.
	* include/experimental/bits/simd_sve.h: New file.

Signed-off-by: Srinivas Yadav Singanaboina vasu.srinivasvasu...@gmail.com
--- libstdc++-v3/include/Makefile.am | 1 + libstdc++-v3/include/Makefile.in | 1 + libstdc++-v3/include/experimental/bits/simd.h | 131 +- .../include/experimental/bits/simd_builtin.h | 35 +- .../experimental/bits/simd_converter.h | 57 +- .../include/experimental/bits/simd_detail.h | 7 +- .../include/experimental/bits/simd_math.h | 14 +- .../include/experimental/bits/simd_sve.h | 1863 +++++++++++++++++ libstdc++-v3/include/experimental/simd | 3 + .../experimental/simd/tests/bits/main.h | 3 + 10 files changed, 2084 insertions(+), 31 deletions(-) create mode 100644 libstdc++-v3/include/experimental/bits/simd_sve.h diff --git a/libstdc++-v3/include/Makefile.am b/libstdc++-v3/include/Makefile.am index 6209f390e08..1170cb047a6 100644 --- a/libstdc++-v3/include/Makefile.am +++ b/libstdc++-v3/include/Makefile.am @@ -826,6 +826,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff --git a/libstdc++-v3/include/Makefile.in b/libstdc++-v3/include/Makefile.in index 596fa0d2390..bc44582a2da 100644 --- a/libstdc++-v3/include/Makefile.in +++ b/libstdc++-v3/include/Makefile.in @@ -1172,6 +1172,7 @@ experimental_bits_headers = \ ${experimental_bits_srcdir}/simd_neon.h \ ${experimental_bits_srcdir}/simd_ppc.h \ ${experimental_bits_srcdir}/simd_scalar.h \ + ${experimental_bits_srcdir}/simd_sve.h \ ${experimental_bits_srcdir}/simd_x86.h \ ${experimental_bits_srcdir}/simd_x86_conversions.h \ ${experimental_bits_srcdir}/string_view.tcc \ diff --git
a/libstdc++-v3/include/experimental/bits/simd.h b/libstdc++-v3/include/experimental/bits/simd.h index 90523ea57dc..d274cd740fe 100644 --- a/libstdc++-v3/include/experimental/bits/simd.h +++ b/libstdc++-v3/include/experimental/bits/simd.h @@ -39,12 +39,16 @@ #include <functional> #include <iosfwd> #include <utility> +#include <algorithm> #if _GLIBCXX_SIMD_X86INTRIN #include <x86intrin.h> #elif _GLIBCXX_SIMD_HAVE_NEON #include <arm_neon.h> #endif +#if _GLIBCXX_SIMD_HAVE_SVE +#include <arm_sve.h> +#endif /** @ingroup ts_simd * @{ @@ -83,6 +87,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double; using __m512i [[__gnu__::__vector_size__(64)]] = long long; #endif +#if _GLIBCXX_SIMD_HAVE_SVE +constexpr inline int __sve_vectorized_size_bytes = __ARM_FEATURE_SVE_BITS / 8; +#else +constexpr inline int __sve_vectorized_size_bytes = 0; +#endif + namespace simd_abi { // simd_abi forward declarations {{{ // implementation details: @@ -108,6 +118,9 @@ template <int _UsedBytes> template <int _UsedBytes> struct _VecBltnBtmsk; +template <int _UsedBytes, int _TotalBytes = __sve_vectorized_size_bytes> + struct _SveAbi; + template <typename _Tp, int _Np> using _VecN = _VecBuiltin<sizeof(_Tp) * _Np>; @@ -123,6 +136,9 @@ template <int _UsedBytes = 64> template <int _UsedBytes = 16> using _Neon = _VecBuiltin<_UsedBytes>; +template <int _UsedBytes = __sve_vectorized_size_bytes> + using _Sve = _SveAbi<_UsedBytes, __sve_vectorized_size_bytes>; + // implementation-defined: using __sse = _Sse<>; using __avx = _Avx<>; @@ -130,6 +146,7 @@ using __avx512 = _Avx512<>; using __neon = _Neon<>; using __neon128 = _Neon<16>; using __neon64 = _Neon<8>; +using __sve = _Sve<>; // standard: template <typename _Tp, size_t _Np, typename...> @@ -250,6 +267,8 @@ constexpr inline bool __support_neon_float = false; #endif +constexpr inline bool __have_sve = _GLIBCXX_SIMD_HAVE_SVE; + #ifdef _ARCH_PWR10 constexpr inline bool __have_power10vec = true; #else @@ -356,12 +375,13 @@ namespace __detail | (__have_avx512vnni << 27) | (__have_avx512vpopcntdq << 28) | (__have_avx512vp2intersect << 29); - else if constexpr (__have_neon) + else if constexpr (__have_neon || __have_sve) return __have_neon | (__have_neon_a32 << 1) | (__have_neon_a64 << 2) | (__have_neon_a64 << 2) - | (__support_neon_float << 3); + | (__support_neon_float << 3) + | (__have_sve << 4); else if constexpr (__have_power_vmx) return __have_power_vmx | (__have_power_vsx << 1) @@ -733,6 +753,16 @@ template <typename _Abi> return _Bytes <= 16 && is_same_v<simd_abi::_VecBuiltin<_Bytes>, _Abi>; } +// }}} +// __is_sve_abi {{{ +template <typename _Abi> + constexpr bool + __is_sve_abi() + { + constexpr auto _Bytes = __abi_bytes_v<_Abi>; + return _Bytes <= __sve_vectorized_size_bytes && is_same_v<simd_abi::_Sve<_Bytes>, _Abi>; + } + // }}} // __make_dependent_t {{{ template <typename, typename _Up> @@ -998,6 +1028,9 @@ template <typename _Tp> template <typename _Tp> using _SimdWrapper64 = _SimdWrapper<_Tp, 64 / sizeof(_Tp)>; +template <typename _Tp, size_t _Width> + struct _SveSimdWrapper; + // }}} // __is_simd_wrapper {{{ template <typename _Tp> @@ -2858,6 +2891,8 @@ template <typename _Tp> constexpr size_t __bytes = __vectorized_sizeof<_Tp>(); if constexpr (__bytes == sizeof(_Tp)) return static_cast<scalar*>(nullptr); + else if constexpr (__have_sve) + return static_cast<_SveAbi<__sve_vectorized_size_bytes>*>(nullptr); else if constexpr (__have_avx512vl || (__have_avx512f && __bytes == 64)) return static_cast<_VecBltnBtmsk<__bytes>*>(nullptr); else @@ -2951,6 +2986,9 @@ 
template <typename _Tp, typename _Abi = simd_abi::__default_abi<_Tp>> template <typename _Tp, size_t _Np, typename = void> struct __deduce_impl; +template <typename _Tp, size_t _Np, typename = void> + struct __no_sve_deduce_impl; + namespace simd_abi { /** * @tparam _Tp The requested `value_type` for the elements. @@ -2965,6 +3003,12 @@ template <typename _Tp, size_t _Np, typename...> template <typename _Tp, size_t _Np, typename... _Abis> using deduce_t = typename deduce<_Tp, _Np, _Abis...>::type; + +template <typename _Tp, size_t _Np, typename...> + struct __no_sve_deduce : __no_sve_deduce_impl<_Tp, _Np> {}; + +template <typename _Tp, size_t _Np, typename... _Abis> + using __no_sve_deduce_t = typename __no_sve_deduce<_Tp, _Np, _Abis...>::type; } // namespace simd_abi // }}}2 @@ -2974,13 +3018,23 @@ template <typename _Tp, typename _V, typename = void> template <typename _Tp, typename _Up, typename _Abi> struct rebind_simd<_Tp, simd<_Up, _Abi>, - void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>> - { using type = simd<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; }; + void_t<std::conditional_t<!__is_sve_abi<_Abi>(), + simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>, + simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>> + { using type = simd<_Tp, std::conditional_t<!__is_sve_abi<_Abi>(), + simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>, + simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>; + }; template <typename _Tp, typename _Up, typename _Abi> struct rebind_simd<_Tp, simd_mask<_Up, _Abi>, - void_t<simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>> - { using type = simd_mask<_Tp, simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>; }; + void_t<std::conditional_t<!__is_sve_abi<_Abi>(), + simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>, + simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>> + { using type = simd_mask<_Tp, std::conditional_t<!__is_sve_abi<_Abi>(), + simd_abi::__no_sve_deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>, + simd_abi::deduce_t<_Tp, simd_size_v<_Up, _Abi>, _Abi>>>; + }; template <typename _Tp, typename _V> using rebind_simd_t = typename rebind_simd<_Tp, _V>::type; @@ -3243,7 +3297,7 @@ template <typename _Tp, typename _Up, typename _Ap> else if constexpr (_Tp::size() == 1) return __x[0]; else if constexpr (sizeof(_Tp) == sizeof(__x) - && !__is_fixed_size_abi_v<_Ap>) + && !__is_fixed_size_abi_v<_Ap> && !__is_sve_abi<_Ap>()) return {__private_init, __vector_bitcast<typename _Tp::value_type, _Tp::size()>( _Ap::_S_masked(__data(__x))._M_data)}; @@ -4004,18 +4058,29 @@ template <typename _V, typename _Ap, split(const simd<typename _V::value_type, _Ap>& __x) { using _Tp = typename _V::value_type; + + auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __x[__i * _V::size() + __j]; }); + }); + }; + if constexpr (_Parts == 1) { return {simd_cast<_V>(__x)}; } else if (__x._M_is_constprop()) { - return __generate_from_n_evaluations<_Parts, array<_V, _Parts>>( - [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { - return _V([&](auto __j) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA - { return __x[__i * _V::size() + __j]; }); - }); + return __gen_fallback(); } +#if _GLIBCXX_SIMD_HAVE_SVE + else if constexpr(__is_sve_abi<_Ap>) + { + return 
__gen_fallback(); + } +#endif else if constexpr ( __is_fixed_size_abi_v<_Ap> && (is_same_v<typename _V::abi_type, simd_abi::scalar> @@ -4115,7 +4180,8 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename> constexpr size_t _N0 = _SL::template _S_at<0>(); using _V = __deduced_simd<_Tp, _N0>; - if (__x._M_is_constprop()) + auto __gen_fallback = [&]() constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA + { return __generate_from_n_evaluations<sizeof...(_Sizes), _Tuple>( [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { using _Vi = __deduced_simd<_Tp, _SL::_S_at(__i)>; @@ -4124,6 +4190,14 @@ template <size_t... _Sizes, typename _Tp, typename _Ap, typename> return __x[__offset + __j]; }); }); + }; + + if (__x._M_is_constprop()) + __gen_fallback(); +#if _GLIBCXX_SIMD_HAVE_SVE + else if constexpr (__have_sve) + __gen_fallback(); +#endif else if constexpr (_Np == _N0) { static_assert(sizeof...(_Sizes) == 1); @@ -4510,8 +4584,10 @@ template <template <int> class _A0, template <int> class... _Rest> // 1. The ABI tag is valid for _Tp // 2. The storage overhead is no more than padding to fill the next // power-of-2 number of bytes - if constexpr (_A0<_Bytes>::template _S_is_valid_v< - _Tp> && __fullsize / 2 < _Np) + if constexpr (_A0<_Bytes>::template _S_is_valid_v<_Tp> + && ((__is_sve_abi<_A0<_Bytes>>() && __have_sve && (_Np <= __sve_vectorized_size_bytes/sizeof(_Tp))) + || (__fullsize / 2 < _Np)) + ) return typename __decay_abi<_A0<_Bytes>>::type{}; else { @@ -4536,7 +4612,13 @@ template <template <int> class _A0, template <int> class... _Rest> // the following lists all native ABIs, which makes them accessible to // simd_abi::deduce and select_best_vector_type_t (for fixed_size). Order // matters: Whatever comes first has higher priority. -using _AllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin, +using _AllNativeAbis = _AbiList< +#if _GLIBCXX_SIMD_HAVE_SVE + simd_abi::_SveAbi, +#endif + simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin, __scalar_abi_wrapper>; + +using _NoSveAllNativeAbis = _AbiList<simd_abi::_VecBltnBtmsk, simd_abi::_VecBuiltin, __scalar_abi_wrapper>; // valid _SimdTraits specialization {{{1 @@ -4551,18 +4633,35 @@ template <typename _Tp, size_t _Np> _Tp, _Np, enable_if_t<_AllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>> { using type = _AllNativeAbis::_FirstValidAbi<_Tp, _Np>; }; +template <typename _Tp, size_t _Np> + struct __no_sve_deduce_impl< + _Tp, _Np, enable_if_t<_NoSveAllNativeAbis::template _S_has_valid_abi<_Tp, _Np>>> + { using type = _NoSveAllNativeAbis::_FirstValidAbi<_Tp, _Np>; }; + // fall back to fixed_size only if scalar and native ABIs don't match template <typename _Tp, size_t _Np, typename = void> struct __deduce_fixed_size_fallback {}; +template <typename _Tp, size_t _Np, typename = void> + struct __no_sve_deduce_fixed_size_fallback {}; + template <typename _Tp, size_t _Np> struct __deduce_fixed_size_fallback<_Tp, _Np, enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>> { using type = simd_abi::fixed_size<_Np>; }; +template <typename _Tp, size_t _Np> + struct __no_sve_deduce_fixed_size_fallback<_Tp, _Np, + enable_if_t<simd_abi::fixed_size<_Np>::template _S_is_valid_v<_Tp>>> + { using type = simd_abi::fixed_size<_Np>; }; + template <typename _Tp, size_t _Np, typename> struct __deduce_impl : public __deduce_fixed_size_fallback<_Tp, _Np> {}; +template <typename _Tp, size_t _Np, typename> + struct __no_sve_deduce_impl : public __no_sve_deduce_fixed_size_fallback<_Tp, _Np> {}; + + //}}}1 /// @endcond diff 
--git a/libstdc++-v3/include/experimental/bits/simd_builtin.h b/libstdc++-v3/include/experimental/bits/simd_builtin.h index 6ccc2fcec9c..bb5d4e3d1c5 100644 --- a/libstdc++-v3/include/experimental/bits/simd_builtin.h +++ b/libstdc++-v3/include/experimental/bits/simd_builtin.h @@ -1614,7 +1614,7 @@ template <typename _Abi, typename> static_assert(_UW_size <= _TV_size); using _UW = _SimdWrapper<_Up, _UW_size>; using _UV = __vector_type_t<_Up, _UW_size>; - using _UAbi = simd_abi::deduce_t<_Up, _UW_size>; + using _UAbi = simd_abi::__no_sve_deduce_t<_Up, _UW_size>; if constexpr (_UW_size == _TV_size) // one convert+store { const _UW __converted = __convert<_UW>(__v); @@ -1857,7 +1857,7 @@ template <typename _Abi, typename> else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, plus<>>) { - using _Ap = simd_abi::deduce_t<_Tp, __full_size>; + using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>; return _Ap::_SimdImpl::_S_reduce( simd<_Tp, _Ap>(__private_init, _Abi::_S_masked(__as_vector(__x))), @@ -1866,7 +1866,7 @@ template <typename _Abi, typename> else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>, multiplies<>>) { - using _Ap = simd_abi::deduce_t<_Tp, __full_size>; + using _Ap = simd_abi::__no_sve_deduce_t<_Tp, __full_size>; using _TW = _SimdWrapper<_Tp, __full_size>; _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector(); @@ -1882,7 +1882,7 @@ template <typename _Abi, typename> } else if constexpr (_Np & 1) { - using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>; + using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np - 1>; return __binary_op( simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce( simd<_Tp, _Ap>( @@ -1936,7 +1936,7 @@ template <typename _Abi, typename> { static_assert(sizeof(__x) > __min_vector_size<_Tp>); static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2 - using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>; + using _Ap = simd_abi::__no_sve_deduce_t<_Tp, _Np / 2>; using _V = simd<_Tp, _Ap>; return _Ap::_SimdImpl::_S_reduce( __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))), @@ -2376,6 +2376,16 @@ template <typename _Abi, typename> _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np> _S_fpclassify(_SimdWrapper<_Tp, _Np> __x) { + if constexpr(__have_sve) + { + __fixed_size_storage_t<int, _Np> __r{}; + __execute_n_times<_Np>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __r._M_set(__i, std::fpclassify(__x[__i])); + }); + return __r; + } + else { using _I = __int_for_sizeof_t<_Tp>; const auto __xn = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x))); @@ -2453,6 +2463,7 @@ template <typename _Abi, typename> })}; else __assert_unreachable<_Tp>(); + } } // _S_increment & _S_decrement{{{2 @@ -2785,11 +2796,23 @@ template <typename _Abi, typename> return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits()); } else + { + if constexpr(__is_sve_abi<_UAbi>()) + { + simd_mask<_Tp> __r(false); + constexpr size_t __min_size = std::min(__r.size(), __x.size()); + __execute_n_times<__min_size>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __r[__i] = __x[__i]; + }); + return __data(__r); + } + else return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>, _S_size<_Tp>>( __data(__x)); } - + } // }}} // _S_masked_load {{{2 template <typename _Tp, size_t _Np> diff --git a/libstdc++-v3/include/experimental/bits/simd_converter.h b/libstdc++-v3/include/experimental/bits/simd_converter.h index 3160e251632..b233d2c70a5 100644 --- 
a/libstdc++-v3/include/experimental/bits/simd_converter.h +++ b/libstdc++-v3/include/experimental/bits/simd_converter.h @@ -28,6 +28,18 @@ #if __cplusplus >= 201703L _GLIBCXX_SIMD_BEGIN_NAMESPACE + +template <typename _Arg, typename _Ret, typename _To, size_t _Np> +_Ret __converter_fallback(_Arg __a) + { + _Ret __ret{}; + __execute_n_times<_Np>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __ret._M_set(__i, static_cast<_To>(__a[__i])); + }); + return __ret; + } + // _SimdConverter scalar -> scalar {{{ template <typename _From, typename _To> struct _SimdConverter<_From, simd_abi::scalar, _To, simd_abi::scalar, @@ -56,14 +68,16 @@ template <typename _From, typename _To, typename _Abi> }; // }}} -// _SimdConverter "native 1" -> "native 2" {{{ +// _SimdConverter "native non-sve 1" -> "native non-sve 2" {{{ template <typename _From, typename _To, typename _AFrom, typename _ATo> struct _SimdConverter< _From, _AFrom, _To, _ATo, enable_if_t<!disjunction_v< __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>, is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>, - conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>>>> + conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>> + && !(__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>()) + >> { using _Arg = typename _AFrom::template __traits<_From>::_SimdMember; using _Ret = typename _ATo::template __traits<_To>::_SimdMember; @@ -75,6 +89,26 @@ template <typename _From, typename _To, typename _AFrom, typename _ATo> { return __vector_convert<_V>(__a, __more...); } }; +// }}} +// _SimdConverter "native 1" -> "native 2" {{{ +template <typename _From, typename _To, typename _AFrom, typename _ATo> + struct _SimdConverter< + _From, _AFrom, _To, _ATo, + enable_if_t<!disjunction_v< + __is_fixed_size_abi<_AFrom>, __is_fixed_size_abi<_ATo>, + is_same<_AFrom, simd_abi::scalar>, is_same<_ATo, simd_abi::scalar>, + conjunction<is_same<_From, _To>, is_same<_AFrom, _ATo>>> + && (__is_sve_abi<_AFrom>() || __is_sve_abi<_ATo>()) + >> + { + using _Arg = typename _AFrom::template __traits<_From>::_SimdMember; + using _Ret = typename _ATo::template __traits<_To>::_SimdMember; + + _GLIBCXX_SIMD_INTRINSIC constexpr _Ret + operator()(_Arg __x) const noexcept + { return __converter_fallback<_Arg, _Ret, _To, simd_size_v<_To, _ATo>>(__x); } + }; + // }}} // _SimdConverter scalar -> fixed_size<1> {{{1 template <typename _From, typename _To> @@ -111,6 +145,10 @@ template <typename _From, typename _To, int _Np> if constexpr (is_same_v<_From, _To>) return __x; + // fallback to sequential when sve is available + else if constexpr (__have_sve) + return __converter_fallback<_Arg, _Ret, _To, _Np>(__x); + // special case (optimize) int signedness casts else if constexpr (sizeof(_From) == sizeof(_To) && is_integral_v<_From> && is_integral_v<_To>) @@ -275,11 +313,14 @@ template <typename _From, typename _Ap, typename _To, int _Np> "_SimdConverter to fixed_size only works for equal element counts"); using _Ret = __fixed_size_storage_t<_To, _Np>; + using _Arg = typename _SimdTraits<_From, _Ap>::_SimdMember; _GLIBCXX_SIMD_INTRINSIC constexpr _Ret - operator()(typename _SimdTraits<_From, _Ap>::_SimdMember __x) const noexcept + operator()(_Arg __x) const noexcept { - if constexpr (_Ret::_S_tuple_size == 1) + if constexpr (__have_sve) + return __converter_fallback<_Arg, _Ret, _To, _Np>(__x); + else if constexpr (_Ret::_S_tuple_size == 1) return {__vector_convert<typename _Ret::_FirstType::_BuiltinType>(__x)}; else { @@ -316,12 +357,14 @@ template <typename _From, int 
_Np, typename _To, typename _Ap> "_SimdConverter to fixed_size only works for equal element counts"); using _Arg = __fixed_size_storage_t<_From, _Np>; + using _Ret = typename _SimdTraits<_To, _Ap>::_SimdMember; _GLIBCXX_SIMD_INTRINSIC constexpr - typename _SimdTraits<_To, _Ap>::_SimdMember - operator()(const _Arg& __x) const noexcept + _Ret operator()(const _Arg& __x) const noexcept { - if constexpr (_Arg::_S_tuple_size == 1) + if constexpr(__have_sve) + return __converter_fallback<_Arg, _Ret, _To, _Np>(__x); + else if constexpr (_Arg::_S_tuple_size == 1) return __vector_convert<__vector_type_t<_To, _Np>>(__x.first); else if constexpr (_Arg::_S_is_homogeneous) return __call_with_n_evaluations<_Arg::_S_tuple_size>( diff --git a/libstdc++-v3/include/experimental/bits/simd_detail.h b/libstdc++-v3/include/experimental/bits/simd_detail.h index 1fb77866bb2..52fdf7149bf 100644 --- a/libstdc++-v3/include/experimental/bits/simd_detail.h +++ b/libstdc++-v3/include/experimental/bits/simd_detail.h @@ -61,6 +61,11 @@ #else #define _GLIBCXX_SIMD_HAVE_NEON_A64 0 #endif +#if (__ARM_FEATURE_SVE_BITS > 0 && __ARM_FEATURE_SVE_VECTOR_OPERATORS==1) +#define _GLIBCXX_SIMD_HAVE_SVE 1 +#else +#define _GLIBCXX_SIMD_HAVE_SVE 0 +#endif //}}} // x86{{{ #ifdef __MMX__ @@ -267,7 +272,7 @@ #define _GLIBCXX_SIMD_IS_UNLIKELY(__x) __builtin_expect(__x, 0) #define _GLIBCXX_SIMD_IS_LIKELY(__x) __builtin_expect(__x, 1) -#if __STRICT_ANSI__ || defined __clang__ +#if _GLIBCXX_SIMD_HAVE_SVE || __STRICT_ANSI__ || defined __clang__ #define _GLIBCXX_SIMD_CONSTEXPR #define _GLIBCXX_SIMD_USE_CONSTEXPR_API const #else diff --git a/libstdc++-v3/include/experimental/bits/simd_math.h b/libstdc++-v3/include/experimental/bits/simd_math.h index c91f05fceb3..0e62a7f1650 100644 --- a/libstdc++-v3/include/experimental/bits/simd_math.h +++ b/libstdc++-v3/include/experimental/bits/simd_math.h @@ -652,6 +652,18 @@ template <typename _Tp, typename _Abi, typename = __detail::__odr_helper> (*__exp)[0] = __tmp; return __r; } + else if constexpr (__is_sve_abi<_Abi>()) + { + simd<_Tp, _Abi> __r; + __execute_n_times<simd_size_v<_Tp, _Abi>>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + int __tmp; + const auto __ri = std::frexp(__x[__i], &__tmp); + (*__exp)[__i] = __tmp; + __r[__i] = __ri; + }); + return __r; + } else if constexpr (__is_fixed_size_abi_v<_Abi>) return {__private_init, _Abi::_SimdImpl::_S_frexp(__data(__x), __data(*__exp))}; #if _GLIBCXX_SIMD_X86INTRIN @@ -1135,7 +1147,7 @@ _GLIBCXX_SIMD_CVTING2(hypot) _GLIBCXX_SIMD_USE_CONSTEXPR_API _V __inf(__infinity_v<_Tp>); #ifndef __FAST_MATH__ - if constexpr (_V::size() > 1 && __have_neon && !__have_neon_a32) + if constexpr (_V::size() > 1 && (__is_neon_abi<_Abi>() && __have_neon && !__have_neon_a32)) { // With ARMv7 NEON, we have no subnormals and must use slightly // different strategy const _V __hi_exp = __hi & __inf; diff --git a/libstdc++-v3/include/experimental/bits/simd_sve.h b/libstdc++-v3/include/experimental/bits/simd_sve.h new file mode 100644 index 00000000000..123242a3a62 --- /dev/null +++ b/libstdc++-v3/include/experimental/bits/simd_sve.h @@ -0,0 +1,1863 @@ +// Simd SVE specific implementations -*- C++ -*- + +// Copyright The GNU Toolchain Authors. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. 
+ +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// Under Section 7 of GPL version 3, you are granted additional +// permissions described in the GCC Runtime Library Exception, version +// 3.1, as published by the Free Software Foundation. + +// You should have received a copy of the GNU General Public License and +// a copy of the GCC Runtime Library Exception along with this program; +// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see +// <http://www.gnu.org/licenses/>. + + +#ifndef _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_ +#define _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_ + +#if __cplusplus >= 201703L + +#if !_GLIBCXX_SIMD_HAVE_SVE +#error "simd_sve.h may only be included when SVE on ARM is available" +#endif + +_GLIBCXX_SIMD_BEGIN_NAMESPACE + +// Helper function mapping to sve supported types +template <typename _Tp> + constexpr auto + __get_sve_value_type() + { + if constexpr (is_integral_v<_Tp>) + { + if constexpr (is_signed_v<_Tp>) + { + if constexpr (sizeof(_Tp) == 1) + return int8_t{}; + else if constexpr (sizeof(_Tp) == 2) + return int16_t{}; + else if constexpr (sizeof(_Tp) == 4) + return int32_t{}; + else if constexpr (sizeof(_Tp) == 8) + return int64_t{}; + else + return _Tp{}; + } + else + { + if constexpr (sizeof(_Tp) == 1) + return uint8_t{}; + else if constexpr (sizeof(_Tp) == 2) + return uint16_t{}; + else if constexpr (sizeof(_Tp) == 4) + return uint32_t{}; + else if constexpr (sizeof(_Tp) == 8) + return uint64_t{}; + else + return _Tp{}; + } + } + else + { + if constexpr (is_floating_point_v<_Tp>) + { + if constexpr (sizeof(_Tp) == 4) + return float32_t{}; + else if constexpr (sizeof(_Tp) == 8) + return float64_t{}; + else + return _Tp{}; + } + } + } + +template <typename _Tp> + using __get_sve_value_type_t = decltype(__get_sve_value_type<_Tp>()); + +typedef svbool_t __sve_bool_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + +template <typename _Tp, size_t _Np> + struct __sve_vector_type + {}; + +template <typename _Tp, size_t _Np> + using __sve_vector_type_t = typename __sve_vector_type<_Tp, _Np>::type; + +template <size_t _Np> + struct __sve_vector_type<int8_t, _Np> + { + typedef svint8_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(int8_t __dup) + { return svdup_s8(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b8(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<uint8_t, _Np> + { + typedef svuint8_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(uint8_t __dup) + { return svdup_u8(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b8(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<int16_t, _Np> + { + typedef svint16_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(int16_t __dup) + { return svdup_s16(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b16(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct 
__sve_vector_type<uint16_t, _Np> + { + typedef svuint16_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(uint16_t __dup) + { return svdup_u16(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b16(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<int32_t, _Np> + { + typedef svint32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(int32_t __dup) + { return svdup_s32(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b32(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<uint32_t, _Np> + { + typedef svuint32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(uint32_t __dup) + { return svdup_u32(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b32(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<int64_t, _Np> + { + typedef svint64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(int64_t __dup) + { return svdup_s64(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b64(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<uint64_t, _Np> + { + typedef svuint64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(uint64_t __dup) + { return svdup_u64(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b64(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<float, _Np> + { + typedef svfloat32_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(float __dup) + { return svdup_f32(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b32(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<double, _Np> + { + typedef svfloat64_t __sve_vlst_type __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static __sve_vlst_type + __sve_broadcast(double __dup) + { return svdup_f64(__dup); } + + inline static __sve_bool_type + __sve_active_mask() + { return svwhilelt_b64(size_t(0), _Np); }; + + using type = __sve_vlst_type; + }; + +template <size_t _Np> + struct __sve_vector_type<char, _Np> + : __sve_vector_type<__get_sve_value_type_t<char>, _Np> + {}; + +template <size_t _Np> + struct __sve_vector_type<char16_t, _Np> + : __sve_vector_type<__get_sve_value_type_t<char16_t>, _Np> + {}; + +template <size_t _Np> + struct __sve_vector_type<wchar_t, _Np> + : __sve_vector_type<__get_sve_value_type_t<wchar_t>, _Np> + {}; + +template <size_t _Np> + struct __sve_vector_type<char32_t, _Np> + : __sve_vector_type<__get_sve_value_type_t<char32_t>, _Np> + {}; + +template <size_t _Np> + struct __sve_vector_type<long long int, _Np> + : __sve_vector_type<__get_sve_value_type_t<long long int>, _Np> + {}; + +template <size_t _Np> + struct __sve_vector_type<long long 
unsigned int, _Np> + : __sve_vector_type<__get_sve_value_type_t<long long unsigned int>, _Np> + {}; + +template <size_t _Size> + struct __sve_mask_type + { + static_assert((_Size & (_Size - 1)) != 0, "This trait may only be used for non-power-of-2 " + "sizes. Power-of-2 sizes must be specialized."); + + using type = typename __sve_mask_type<std::__bit_ceil(_Size)>::type; + }; + +template <size_t _Size> + using __sve_mask_type_t = typename __sve_mask_type<_Size>::type; + +template <> + struct __sve_mask_type<1> + { + using type = __sve_bool_type; + + using __sve_mask_uint_type = uint8_t; + + typedef svuint8_t __sve_mask_vector_type + __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static auto + __sve_mask_active_count(type __active_mask, type __pred) + { return svcntp_b8(__active_mask, __pred); } + + inline static type + __sve_mask_first_true() + { return svptrue_pat_b8(SV_VL1); } + + inline static type + __sve_mask_next_true(type __active_mask, type __pred) + { return svpnext_b8(__active_mask, __pred); } + + inline static bool + __sve_mask_get(type __active_mask, size_t __i) + { return __sve_mask_vector_type(svdup_u8_z(__active_mask, 1))[__i] != 0;} + + inline static const __sve_mask_vector_type __index0123 = svindex_u8(0, 1); + }; + +template <> + struct __sve_mask_type<2> + { + using type = __sve_bool_type; + + using __sve_mask_uint_type = uint16_t; + + typedef svuint16_t __sve_mask_vector_type + __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static auto + __sve_mask_active_count(type __active_mask, type __pred) + { return svcntp_b16(__active_mask, __pred); } + + inline static type + __sve_mask_first_true() + { return svptrue_pat_b16(SV_VL1); } + + inline static type + __sve_mask_next_true(type __active_mask, type __pred) + { return svpnext_b16(__active_mask, __pred); } + + inline static bool + __sve_mask_get(type __active_mask, size_t __i) + { return __sve_mask_vector_type(svdup_u16_z(__active_mask, 1))[__i] != 0;} + + inline static const __sve_mask_vector_type __index0123 = svindex_u16(0, 1); + }; + +template <> + struct __sve_mask_type<4> + { + using type = __sve_bool_type; + + using __sve_mask_uint_type = uint32_t; + + typedef svuint32_t __sve_mask_vector_type + __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static auto + __sve_mask_active_count(type __active_mask, type __pred) + { return svcntp_b32(__active_mask, __pred); } + + inline static type + __sve_mask_first_true() + { return svptrue_pat_b32(SV_VL1); } + + inline static type + __sve_mask_next_true(type __active_mask, type __pred) + { return svpnext_b32(__active_mask, __pred); } + + inline static bool + __sve_mask_get(type __active_mask, size_t __i) + { return __sve_mask_vector_type(svdup_u32_z(__active_mask, 1))[__i] != 0;} + + inline static const __sve_mask_vector_type __index0123 = svindex_u32(0, 1); + }; + +template <> + struct __sve_mask_type<8> + { + using type = __sve_bool_type; + + using __sve_mask_uint_type = uint64_t; + + typedef svuint64_t __sve_mask_vector_type + __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))); + + inline static auto + __sve_mask_active_count(type __active_mask, type __pred) + { return svcntp_b64(__active_mask, __pred); } + + inline static type + __sve_mask_first_true() + { return svptrue_pat_b64(SV_VL1); } + + inline static type + __sve_mask_next_true(type __active_mask, type __pred) + { return svpnext_b64(__active_mask, __pred); } + + inline static bool + __sve_mask_get(type __active_mask, size_t __i) + { 
return __sve_mask_vector_type(svdup_u64_z(__active_mask, 1))[__i] != 0;} + + inline static const __sve_mask_vector_type __index0123 = svindex_u64(0, 1); + }; + +template <typename _To, typename _From> + _GLIBCXX_SIMD_INTRINSIC constexpr auto + __sve_reinterpret_cast(_From __v) + { + if constexpr (std::is_same_v<_To, int32_t>) + return svreinterpret_s32(__v); + else if constexpr (std::is_same_v<_To, int64_t>) + return svreinterpret_s64(__v); + else if constexpr (std::is_same_v<_To, float32_t>) + return svreinterpret_f32(__v); + else if constexpr (std::is_same_v<_To, float64_t>) + return svreinterpret_f64(__v); + else + __assert_unreachable<_To>(); // add more cases if needed. + } + +template <typename _Tp, size_t _Width> + struct _SveSimdWrapper + { + static_assert(__is_vectorizable_v<_Tp>); + + static_assert(_Width >= 2); // 1 doesn't make sense, use _Tp directly then + + using _BuiltinType = __sve_vector_type_t<_Tp, _Width>; + + using value_type = _Tp; + + static inline constexpr size_t _S_full_size = sizeof(_BuiltinType) / sizeof(value_type); + + static inline constexpr int _S_size = _Width; + + static inline constexpr bool _S_is_partial = _S_full_size != _S_size; + + _BuiltinType _M_data; + + _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper<_Tp, _S_full_size> + __as_full_vector() const + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveSimdWrapper(initializer_list<_Tp> __init) + : _M_data(__generate_from_n_evaluations<_Width, _BuiltinType>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __init.begin()[__i.value]; + })) + {} + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveSimdWrapper() = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveSimdWrapper(const _SveSimdWrapper&) = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveSimdWrapper(_SveSimdWrapper&&) = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper& + operator=(const _SveSimdWrapper&) = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr _SveSimdWrapper& + operator=(_SveSimdWrapper&&) = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveSimdWrapper(__sve_vector_type_t<_Tp, _Width> __x) + : _M_data(__x) + {} + + template <typename... _As, typename = enable_if_t<((is_same_v<simd_abi::scalar, _As> && ...) 
+ && sizeof...(_As) <= _Width)>> + _GLIBCXX_SIMD_INTRINSIC constexpr + operator _SimdTuple<_Tp, _As...>() const + { + return __generate_from_n_evaluations<sizeof...(_As), _SimdTuple<_Tp, _As...>>( + [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return _M_data[int(__i)]; + }); + } + + _GLIBCXX_SIMD_INTRINSIC constexpr + operator const _BuiltinType&() const + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC constexpr + operator _BuiltinType&() + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC constexpr _Tp + operator[](size_t __i) const + { return _M_data[__i]; } + + template <size_t __i> + _GLIBCXX_SIMD_INTRINSIC constexpr _Tp + operator[](_SizeConstant<__i>) const + { return _M_data[__i]; } + + _GLIBCXX_SIMD_INTRINSIC constexpr void + _M_set(size_t __i, _Tp __x) + { + _M_data[__i] = __x; + } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop() const + { return false; } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop_none_of() const + { return false; } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop_all_of() const + { return false; } + }; + +template <size_t _Bits, size_t _Width> + struct _SveMaskWrapper + { + using _BuiltinSveMaskType = __sve_mask_type<_Bits>; + + using _BuiltinSveVectorType = __sve_vector_type<__int_with_sizeof_t<_Bits>, _Width>; + + using _BuiltinType = typename _BuiltinSveMaskType::type; + + using value_type = bool; + + static constexpr size_t _S_full_size = sizeof(_BuiltinType); + + _GLIBCXX_SIMD_INTRINSIC constexpr _SveMaskWrapper<_Bits, _S_full_size> + __as_full_vector() const + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveMaskWrapper() = default; + + _GLIBCXX_SIMD_INTRINSIC constexpr + _SveMaskWrapper(_BuiltinType __k) + : _M_data(__k) + {}; + + _GLIBCXX_SIMD_INTRINSIC + operator const _BuiltinType&() const + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC + operator _BuiltinType&() + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC _BuiltinType + __intrin() const + { return _M_data; } + + _GLIBCXX_SIMD_INTRINSIC constexpr value_type + operator[](size_t __i) const + { + return _BuiltinSveMaskType::__sve_mask_get(_M_data, __i); + } + + template <size_t __i> + _GLIBCXX_SIMD_INTRINSIC constexpr value_type + operator[](_SizeConstant<__i>) const + { + return _BuiltinSveMaskType::__sve_mask_get(_M_data, __i); + } + + _GLIBCXX_SIMD_INTRINSIC constexpr void + _M_set(size_t __i, value_type __x) + { + _BuiltinType __index + = svcmpeq(_BuiltinSveVectorType::__sve_active_mask(), _BuiltinSveMaskType::__index0123, + typename _BuiltinSveMaskType::__sve_mask_uint_type(__i)); + + if (__x) + _M_data = svorr_z(_BuiltinSveVectorType::__sve_active_mask(), _M_data, __index); + else + _M_data = svbic_z(_BuiltinSveVectorType::__sve_active_mask(), _M_data, __index); + } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop() const + { return false; } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop_none_of() const + { return false; } + + _GLIBCXX_SIMD_INTRINSIC constexpr bool + _M_is_constprop_all_of() const + { return false; } + + _BuiltinType _M_data; + }; + +struct _CommonImplSve; + +template <typename _Abi, typename = __detail::__odr_helper> + struct _SimdImplSve; + +template <typename _Abi, typename = __detail::__odr_helper> + struct _MaskImplSve; + +template <int _UsedBytes, int> + struct simd_abi::_SveAbi + { + template <typename _Tp> + static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp); + + struct _IsValidAbiTag + : __bool_constant<(_UsedBytes > 1)> + {}; + + template <typename _Tp> + 
struct _IsValidSizeFor + : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1 && _UsedBytes % sizeof(_Tp) == 0 + && _UsedBytes <= __sve_vectorized_size_bytes)> + {}; + + template <typename _Tp> + struct _IsValid + : conjunction<_IsValidAbiTag, __bool_constant<__have_sve>, + __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>, + _IsValidSizeFor<_Tp>> + {}; + + template <typename _Tp> + static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value; + + using _CommonImpl = _CommonImplSve; + + using _SimdImpl = _SimdImplSve<_SveAbi<_UsedBytes>>; + + using _MaskImpl = _MaskImplSve<_SveAbi<_UsedBytes>>; + + template <typename _Tp> + using _MaskMember = _SveMaskWrapper<sizeof(_Tp), _S_size<_Tp>>; + + template <typename _Tp, bool = _S_is_valid_v<_Tp>> + struct __traits : _InvalidTraits + {}; + + template <typename _Tp> + struct __traits<_Tp, true> + { + using _IsValid = true_type; + using _SimdImpl = _SimdImplSve<_SveAbi<_UsedBytes>>; + using _MaskImpl = _MaskImplSve<_SveAbi<_UsedBytes>>; + + using _SimdMember = _SveSimdWrapper<_Tp, _S_size<_Tp>>; // sve vector type + using _MaskMember = _SveMaskWrapper<sizeof(_Tp), _S_size<_Tp>>; // sve mask type + + static constexpr size_t _S_simd_align = alignof(_SimdMember); + static constexpr size_t _S_mask_align = alignof(_MaskMember); + + static constexpr size_t _S_full_size = _SimdMember::_S_full_size; + static constexpr bool _S_is_partial = _SimdMember::_S_is_partial; + + struct _SimdBase + { + _GLIBCXX_SIMD_ALWAYS_INLINE explicit + operator __sve_vector_type_t<_Tp, _S_size<_Tp>>() const + { return __data(*static_cast<const simd<_Tp, _SveAbi<_UsedBytes>>*>(this)); } + }; + + class _SimdCastType + { + using _Ap = __sve_vector_type_t<_Tp, _S_size<_Tp>>; + + _SimdMember _M_data; + + public: + _GLIBCXX_SIMD_ALWAYS_INLINE constexpr + _SimdCastType(_Ap __a) + : _M_data(__a) + {} + + _GLIBCXX_SIMD_ALWAYS_INLINE constexpr + operator _SimdMember() const + { return _M_data; } + }; + + struct _MaskBase + { + _GLIBCXX_SIMD_ALWAYS_INLINE explicit + operator __sve_mask_type_t<sizeof(_Tp)>() const + { + return __data(*static_cast<const simd_mask<_Tp, _SveAbi<_UsedBytes>>*>(this)); + } + }; + + class _MaskCastType + { + using _Ap = __sve_mask_type_t<sizeof(_Tp)>; + + _Ap _M_data; + + public: + _GLIBCXX_SIMD_ALWAYS_INLINE constexpr + _MaskCastType(_Ap __a) + : _M_data(__a) + {} + + _GLIBCXX_SIMD_ALWAYS_INLINE constexpr + operator _MaskMember() const + { return _M_data; } + }; + }; + + template <typename _Tp> + static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size; + + template <typename _Tp> + static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial; + }; + +template <typename _Tp, size_t _Np> + using __sve_mask = __sve_mask_type<sizeof(_Tp)>; + +struct _CommonImplSve +{ + // _S_converts_via_decomposition + // This lists all cases where a __vector_convert needs to fall back to + // conversion of individual scalars (i.e. decompose the input vector into + // scalars, convert, compose output vector). In those cases, _S_masked_load & + // _S_masked_store prefer to use the _S_bit_iteration implementation. 
+ template <typename _From, typename _To, size_t _ToSize> + static inline constexpr bool __converts_via_decomposition_v = sizeof(_From) != sizeof(_To); + + template <typename _Tp, typename _Up, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_load(const _Up* __p, _SveMaskWrapper<sizeof(_Tp), _Np> __k) + { + using _STp = __get_sve_value_type_t<_Tp>; + using _SUp = __get_sve_value_type_t<_Up>; + using _V = __sve_vector_type_t<_Tp, _Np>; + const _SUp* __up = reinterpret_cast<const _SUp*>(__p); + + if constexpr (std::is_same_v<_Tp, _Up>) + return _V(svld1(__k._M_data, __up)); + if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up> + && (sizeof(_Tp) > sizeof(_Up))) + { + if constexpr (std::is_same_v<_SUp, int8_t>) + { + if constexpr (std::is_same_v<_STp, int16_t>) + return _V(svld1sb_s16(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint16_t>) + return _V(svld1sb_u16(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int32_t>) + return _V(svld1sb_s32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint32_t>) + return _V(svld1sb_u32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1sb_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1sb_u64(__k._M_data, __up)); + } + if constexpr (std::is_same_v<_SUp, uint8_t>) + { + if constexpr (std::is_same_v<_STp, int16_t>) + return _V(svld1ub_s16(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint16_t>) + return _V(svld1ub_u16(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int32_t>) + return _V(svld1ub_s32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint32_t>) + return _V(svld1ub_u32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1ub_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1ub_u64(__k._M_data, __up)); + } + if constexpr (std::is_same_v<_SUp, int16_t>) + { + if constexpr (std::is_same_v<_STp, int32_t>) + return _V(svld1sh_s32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint32_t>) + return _V(svld1sh_u32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1sh_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1sh_u64(__k._M_data, __up)); + } + if constexpr (std::is_same_v<_SUp, uint16_t>) + { + if constexpr (std::is_same_v<_STp, int32_t>) + return _V(svld1uh_s32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint32_t>) + return _V(svld1uh_u32(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1uh_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1uh_u64(__k._M_data, __up)); + } + if constexpr (std::is_same_v<_SUp, int32_t>) + { + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1sw_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1sw_u64(__k._M_data, __up)); + } + if constexpr (std::is_same_v<_SUp, uint32_t>) + { + if constexpr (std::is_same_v<_STp, int64_t>) + return _V(svld1uw_s64(__k._M_data, __up)); + if constexpr (std::is_same_v<_STp, uint64_t>) + return _V(svld1uw_u64(__k._M_data, __up)); + } + } + return __generate_from_n_evaluations<_Np, __sve_vector_type_t<_Tp, _Np>>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + return __k[__i] ? 
static_cast<_Tp>(__p[__i]) : _Tp{}; + }); + } + + template <typename _Tp, typename _Up, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_store(_Up* __p, _SveSimdWrapper<_Tp, _Np> __x, _SveMaskWrapper<sizeof(_Tp), _Np> __k) + { + using _SUp = __get_sve_value_type_t<_Up>; + using _STp = __get_sve_value_type_t<_Tp>; + + _SUp* __up = reinterpret_cast<_SUp*>(__p); + + if constexpr (std::is_same_v<_Tp, _Up>) + return svst1(__k._M_data, __up, __x); + if constexpr (std::is_integral_v<_Tp> && std::is_integral_v<_Up> + && (sizeof(_Tp) > sizeof(_Up))) + { + if constexpr (std::is_same_v<_SUp, int8_t> && std::is_signed_v<_STp>) + return svst1b(__k._M_data, __up, __x); + if constexpr (std::is_same_v<_SUp, uint8_t> && std::is_unsigned_v<_STp>) + return svst1b(__k._M_data, __up, __x); + if constexpr (std::is_same_v<_SUp, int16_t> && std::is_signed_v<_STp>) + return svst1h(__k._M_data, __up, __x); + if constexpr (std::is_same_v<_SUp, uint16_t> && std::is_unsigned_v<_STp>) + return svst1h(__k._M_data, __up, __x); + if constexpr (std::is_same_v<_SUp, int32_t> && std::is_signed_v<_STp>) + return svst1w(__k._M_data, __up, __x); + if constexpr (std::is_same_v<_SUp, uint32_t> && std::is_unsigned_v<_STp>) + return svst1w(__k._M_data, __up, __x); + } + + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + if (__k[__i]) + __p[__i] = static_cast<_Up>(__x[__i]); + }); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_blend(_SveMaskWrapper<sizeof(_Tp), _Np> __k, _SveSimdWrapper<_Tp, _Np> __at0, + _SveSimdWrapper<_Tp, _Np> __at1) + { return svsel(__k._M_data, __at1._M_data, __at0._M_data); } + + template <size_t _Np, bool _Sanitized> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem) + { + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __mem[__i] = __x[__i]; + }); + } +}; + +template <typename _Abi, typename> + struct _SimdImplSve + { + template <typename _Tp> + using _MaskMember = typename _Abi::template _MaskMember<_Tp>; + + template <typename _Tp> + using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember; + + using _CommonImpl = typename _Abi::_CommonImpl; + using _SuperImpl = typename _Abi::_SimdImpl; + using _MaskImpl = typename _Abi::_MaskImpl; + + template <typename _Tp> + static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>; + + template <typename _Tp> + static constexpr size_t _S_size = _Abi::template _S_size<_Tp>; + + template <typename _Tp> + using _TypeTag = _Tp*; + + using abi_type = _Abi; + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr auto + _S_broadcast(_Tp __x) noexcept + { + return __sve_vector_type<_Tp, __sve_vectorized_size_bytes / sizeof(_Tp)> + ::__sve_broadcast(__x); + } + + template <typename _Fp, typename _Tp> + inline static constexpr _SimdMember<_Tp> + _S_generator(_Fp&& __gen, _TypeTag<_Tp>) + { + constexpr size_t _Np = _S_size<_Tp>; + _SveSimdWrapper<_Tp, _Np> __ret; + __execute_n_times<_S_size<_Tp>>( + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __ret._M_set(__i, __gen(__i)); }); + return __ret; + } + + template <typename _Tp, typename _Up> + _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp> + _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept + { + constexpr size_t _Np = _S_size<_Tp>; + _SimdMember<_Tp> __ret = _CommonImpl::template _S_load<_Tp, _Up, _Np>( + __mem, _SveMaskWrapper<sizeof(_Tp), _Np>{ + 
__sve_vector_type<_Tp, _Np>::__sve_active_mask()}); + return __ret; + } + + template <typename _Tp, size_t _Np, typename _Up> + static constexpr inline _SveSimdWrapper<_Tp, _Np> + _S_masked_load(_SveSimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k, const _Up* __mem) + noexcept + { + __sve_vector_type_t<_Tp, _Np> __v + = _CommonImpl::template _S_load<_Tp, _Up, _Np>(__mem, __k); + __sve_vector_type_t<_Tp, _Np> __ret = svsel(__k._M_data, __v, __merge._M_data); + return __ret; + } + + template <typename _Tp, typename _Up> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept + { + constexpr size_t _Np = _S_size<_Tp>; + _CommonImpl::template _S_store<_Tp, _Up, _Np>( + __mem, __v, __sve_vector_type<_Tp, _Np>::__sve_active_mask()); + } + + template <typename _Tp, typename _Up, size_t _Np> + static constexpr inline void + _S_masked_store(const _SveSimdWrapper<_Tp, _Np> __v, _Up* __mem, + const _SveMaskWrapper<sizeof(_Tp), _Np> __k) noexcept + { _CommonImpl::template _S_store<_Tp, _Up, _Np>(__mem, __v, __k); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> + _S_negate(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { + return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, + __sve_vector_type<_Tp, _Np>::__sve_broadcast(_Tp{})); + } + + template <typename _Tp, typename _BinaryOperation> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op) + { + auto __x_data = __x._M_data; + constexpr size_t _Np = simd_size_v<_Tp, _Abi>; + using __sve_vec_t = __sve_vector_type_t<_Tp, _Np>; + std::size_t __i = __x.size(); + for (; (__i % 2) != 1; __i /= 2) + { + __x_data = __binary_op(simd<_Tp, _Abi>( + __private_init, _SveSimdWrapper<_Tp, _Np>( + __sve_vec_t(svuzp1(__x_data, __x_data)))), + simd<_Tp, _Abi>( + __private_init, _SveSimdWrapper<_Tp, _Np>( + __sve_vec_t(svuzp2(__x_data, __x_data)))) + )._M_data; + } + _Tp __res = __x_data[0]; + for (size_t __ri = 1; __ri != __i; __ri++) + __res = __binary_op(__x_data[__ri], __res); + return __res; + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, plus<>) + { + return svaddv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, bit_and<>) + { + return svandv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, bit_or<>) + { + return svorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, bit_xor<>) + { + return sveorv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, __detail::_Maximum()) + { + return svmaxv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp> + _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp + _S_reduce(simd<_Tp, _Abi> __x, __detail::_Minimum()) + { + return svminv(__sve_vector_type<_Tp, _S_size<_Tp>>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC 
static constexpr + __sve_vector_type_t<_Tp, _Np> + _S_min(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b) + { + return svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr + __sve_vector_type_t<_Tp, _Np> + _S_max(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b) + { + return svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr + pair<_SveSimdWrapper<_Tp, _Np>, _SveSimdWrapper<_Tp, _Np>> + _S_minmax(_SveSimdWrapper<_Tp, _Np> __a, _SveSimdWrapper<_Tp, _Np> __b) + { + return { + svmin_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data), + svmax_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __a._M_data, __b._M_data) + }; + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_complement(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { + if constexpr (is_floating_point_v<_Tp>) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + return __sve_reinterpret_cast<_Tp>( + svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __sve_reinterpret_cast<_Ip>(__x))); + } + else + return svnot_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np> + _S_unary_minus(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { + return svmul_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, + static_cast<_Tp>(-1)); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_plus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data + __y._M_data; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_minus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data - __y._M_data; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_multiplies(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data * __y._M_data; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_divides(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + __sve_vector_type_t<_Tp, _Np> __y_padded = svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); + return __x._M_data / __y_padded; + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_modulus(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + __sve_vector_type_t<_Tp, _Np> __y_padded = svsel(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __y._M_data, __sve_vector_type<_Tp, _Np>::__sve_broadcast(1)); + return __x._M_data % __y_padded; + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_bit_and(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + if constexpr (is_floating_point_v<_Tp>) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + 
return __sve_reinterpret_cast<_Tp>( + svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y))); + } + else + return svand_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_bit_or(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + if constexpr (is_floating_point_v<_Tp>) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + return __sve_reinterpret_cast<_Tp>( + svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y))); + } + else + return svorr_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_bit_xor(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + if constexpr (is_floating_point_v<_Tp>) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + return __sve_reinterpret_cast<_Tp>( + sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __sve_reinterpret_cast<_Ip>(__x), __sve_reinterpret_cast<_Ip>(__y))); + } + else + return sveor_x(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np> + _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data << __y._M_data; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static __sve_vector_type_t<_Tp, _Np> + _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { return __x._M_data >> __y._M_data; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_bit_shift_left(_SveSimdWrapper<_Tp, _Np> __x, int __y) + { return __x._M_data << __y; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr __sve_vector_type_t<_Tp, _Np> + _S_bit_shift_right(_SveSimdWrapper<_Tp, _Np> __x, int __y) + { return __x._M_data >> __y; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_increment(_SveSimdWrapper<_Tp, _Np>& __x) + { __x = __x._M_data + 1; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_decrement(_SveSimdWrapper<_Tp, _Np>& __x) + { __x = __x._M_data - 1; } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> + _S_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svcmpeq(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> + _S_not_equal_to(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svcmpne(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp> + _S_less(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svcmplt(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr 
_MaskMember<_Tp> + _S_less_equal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svcmple(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + // simd.math +#define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \ + template <typename _Tp, size_t _Np, typename... _More> \ + static _SveSimdWrapper<_Tp, _Np> _S_##__name(const _SveSimdWrapper<_Tp, _Np>& __x, \ + const _More&... __more) \ + { \ + _SveSimdWrapper<_Tp, _Np> __r; \ + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + __r._M_set(__i, __name(__x[__i], __more[__i]...)); \ + }); \ + return __r; \ + } + +#define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \ + template <typename _Tp, typename... _More> \ + static auto _S_##__name(const _Tp& __x, const _More&... __more) \ + { \ + return __fixed_size_storage_t<_RetTp, _Tp::_S_size>::_S_generate( \ + [&](auto __meta) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __meta._S_generator( \ + [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \ + return __name(__x[__meta._S_offset + __i], \ + __more[__meta._S_offset + __i]...); \ + }, static_cast<_RetTp*>(nullptr)); \ + }); \ + } + + _GLIBCXX_SIMD_MATH_FALLBACK(acos) + _GLIBCXX_SIMD_MATH_FALLBACK(asin) + _GLIBCXX_SIMD_MATH_FALLBACK(atan) + _GLIBCXX_SIMD_MATH_FALLBACK(atan2) + _GLIBCXX_SIMD_MATH_FALLBACK(cos) + _GLIBCXX_SIMD_MATH_FALLBACK(sin) + _GLIBCXX_SIMD_MATH_FALLBACK(tan) + _GLIBCXX_SIMD_MATH_FALLBACK(acosh) + _GLIBCXX_SIMD_MATH_FALLBACK(asinh) + _GLIBCXX_SIMD_MATH_FALLBACK(atanh) + _GLIBCXX_SIMD_MATH_FALLBACK(cosh) + _GLIBCXX_SIMD_MATH_FALLBACK(sinh) + _GLIBCXX_SIMD_MATH_FALLBACK(tanh) + _GLIBCXX_SIMD_MATH_FALLBACK(exp) + _GLIBCXX_SIMD_MATH_FALLBACK(exp2) + _GLIBCXX_SIMD_MATH_FALLBACK(expm1) + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb) + _GLIBCXX_SIMD_MATH_FALLBACK(log) + _GLIBCXX_SIMD_MATH_FALLBACK(log10) + _GLIBCXX_SIMD_MATH_FALLBACK(log1p) + _GLIBCXX_SIMD_MATH_FALLBACK(log2) + _GLIBCXX_SIMD_MATH_FALLBACK(logb) + + // modf implemented in simd_math.h + _GLIBCXX_SIMD_MATH_FALLBACK(scalbn) + _GLIBCXX_SIMD_MATH_FALLBACK(scalbln) + _GLIBCXX_SIMD_MATH_FALLBACK(cbrt) + _GLIBCXX_SIMD_MATH_FALLBACK(pow) + _GLIBCXX_SIMD_MATH_FALLBACK(erf) + _GLIBCXX_SIMD_MATH_FALLBACK(erfc) + _GLIBCXX_SIMD_MATH_FALLBACK(lgamma) + _GLIBCXX_SIMD_MATH_FALLBACK(tgamma) + + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint) + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint) + + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround) + _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround) + + _GLIBCXX_SIMD_MATH_FALLBACK(fmod) + _GLIBCXX_SIMD_MATH_FALLBACK(remainder) + + template <typename _Tp, size_t _Np> + static _SveSimdWrapper<_Tp, _Np> + _S_remquo(const _SveSimdWrapper<_Tp, _Np> __x, const _SveSimdWrapper<_Tp, _Np> __y, + __fixed_size_storage_t<int, _Np>* __z) + { + _SveSimdWrapper<_Tp, _Np> __r{}; + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + int __tmp; + __r._M_set(__i, remquo(__x[__i], __y[__i], &__tmp)); + __z->_M_set(__i, __tmp); + }); + return __r; + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np> + _S_fpclassify(_SveSimdWrapper<_Tp, _Np> __x) + { + __fixed_size_storage_t<int, _Np> __r{}; + __execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { + __r._M_set(__i, std::fpclassify(__x[__i])); + }); + return __r; + } + + // copysign in simd_math.h + _GLIBCXX_SIMD_MATH_FALLBACK(nextafter) + _GLIBCXX_SIMD_MATH_FALLBACK(fdim) + +#undef _GLIBCXX_SIMD_MATH_FALLBACK 
+#undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET + + template <typename _Tp, size_t _Np, typename _Op> + static constexpr _MaskMember<_Tp> + __fp_cmp(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y, _Op __op) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + using _VI = __sve_vector_type_t<_Ip, _Np>; + using _WI = _SveSimdWrapper<_Ip, _Np>; + const _WI __fmv = __sve_vector_type<_Ip, _Np>::__sve_broadcast(__finite_max_v<_Ip>); + const _WI __zerov = __sve_vector_type<_Ip, _Np>::__sve_broadcast(0); + const _WI __xn = _VI(__sve_reinterpret_cast<_Ip>(__x)); + const _WI __yn = _VI(__sve_reinterpret_cast<_Ip>(__y)); + + const _WI __xp + = svsel(_S_less(__xn, __zerov), _S_unary_minus(_WI(_S_bit_and(__xn, __fmv))), __xn); + const _WI __yp + = svsel(_S_less(__yn, __zerov), _S_unary_minus(_WI(_S_bit_and(__yn, __fmv))), __yn); + return svbic_z(__sve_vector_type<_Ip, _Np>::__sve_active_mask(), __op(__xp, __yp)._M_data, + _SuperImpl::_S_isunordered(__x, __y)._M_data); + } + + template <typename _Tp, size_t _Np> + static constexpr _MaskMember<_Tp> + _S_isgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less(__yp, __xp); }); } + + template <typename _Tp, size_t _Np> + static constexpr _MaskMember<_Tp> + _S_isgreaterequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less_equal(__yp, __xp); }); } + + template <typename _Tp, size_t _Np> + static constexpr _MaskMember<_Tp> + _S_isless(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less(__xp, __yp); }); } + + template <typename _Tp, size_t _Np> + static constexpr _MaskMember<_Tp> + _S_islessequal(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept + { return __fp_cmp(__x, __y, [](auto __xp, auto __yp) { return _S_less_equal(__xp, __yp); }); } + + template <typename _Tp, size_t _Np> + static constexpr _MaskMember<_Tp> + _S_islessgreater(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) noexcept + { + return svbic_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), + _SuperImpl::_S_not_equal_to(__x, __y)._M_data, + _SuperImpl::_S_isunordered(__x, __y)._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_abs(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_fabs(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svabs_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_sqrt(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svsqrt_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_ldexp(_SveSimdWrapper<_Tp, _Np> __x, __fixed_size_storage_t<int, _Np> __y) noexcept + { + auto __sve_register = __y.first; + if constexpr (std::is_same_v<_Tp, float>) + return svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, + __sve_register._M_data); + else + { + __sve_vector_type_t<int64_t, _Np> __sve_d_register = svunpklo(__sve_register); + return 
svscale_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, + __sve_d_register); + } + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_fma(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y, + _SveSimdWrapper<_Tp, _Np> __z) + { + return svmad_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data, + __z._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_fmax(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svmaxnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_fmin(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svminnm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_isfinite([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) + { +#if __FINITE_MATH_ONLY__ + return __sve_vector_type_t<_Tp, _Np>::__sve_all_true_mask(); +#else + // if all exponent bits are set, __x is either inf or NaN + + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + const __sve_vector_type_t<_Ip, _Np> __absn = __sve_reinterpret_cast<_Ip>(_S_abs(__x)); + const __sve_vector_type_t<_Ip, _Np> __maxn + = __sve_reinterpret_cast<_Ip>( + __sve_vector_type<_Tp, _Np>::__sve_broadcast(__finite_max_v<_Tp>)); + + return _S_less_equal(_SveSimdWrapper<_Ip, _Np>{__absn}, _SveSimdWrapper<_Ip, _Np>{__maxn}); +#endif + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_isinf([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) + { +#if __FINITE_MATH_ONLY__ + return {}; // false +#else + return _S_equal_to<_Tp, _Np>(_S_abs(__x), _S_broadcast(__infinity_v<_Tp>)); +#endif + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_isnan([[maybe_unused]] _SveSimdWrapper<_Tp, _Np> __x) + { +#if __FINITE_MATH_ONLY__ + return {}; // false +#else + return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __x._M_data); +#endif + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_isnormal(_SveSimdWrapper<_Tp, _Np> __x) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + using _V = __sve_vector_type_t<_Ip, _Np>; + using _VW = _SveSimdWrapper<_Ip, _Np>; + + const _V __absn = __sve_reinterpret_cast<_Ip>(_S_abs(__x)); + const _V __minn = __sve_reinterpret_cast<_Ip>( + __sve_vector_type<_Tp, _Np>::__sve_broadcast(__norm_min_v<_Tp>)); +#if __FINITE_MATH_ONLY__ + return _S_greater_equal(_VW{__absn}, _VW{__minn}); +#else + const _V __maxn = __sve_reinterpret_cast<_Ip>( + __sve_vector_type<_Tp, _Np>::__sve_broadcast(__finite_max_v<_Tp>)); + return _MaskImpl::_S_bit_and(_S_less_equal(_VW{__minn}, _VW{__absn}), + _S_less_equal(_VW{__absn}, _VW{__maxn})); +#endif + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_signbit(_SveSimdWrapper<_Tp, _Np> __x) + { + using _Ip = __get_sve_value_type_t<__int_for_sizeof_t<_Tp>>; + using _V = __sve_vector_type_t<_Ip, _Np>; + using _VW = _SveSimdWrapper<_Ip, _Np>; + + const _V __xn = __sve_reinterpret_cast<_Ip>(__x); + const _V __zeron = __sve_vector_type<_Ip, _Np>::__sve_broadcast(0); + return _S_less(_VW{__xn}, 
_VW{__zeron}); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp> + _S_isunordered(_SveSimdWrapper<_Tp, _Np> __x, _SveSimdWrapper<_Tp, _Np> __y) + { + return svcmpuo(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data, __y._M_data); + } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_nearbyint(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svrinti_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_rint(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return _SuperImpl::_S_nearbyint(__x); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_trunc(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svrintz_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_round(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svrinta_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_floor(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svrintm_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static _SveSimdWrapper<_Tp, _Np> + _S_ceil(_SveSimdWrapper<_Tp, _Np> __x) noexcept + { return svrintp_z(__sve_vector_type<_Tp, _Np>::__sve_active_mask(), __x._M_data); } + + template <typename _Tp, size_t _Bits, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs, + __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs) + { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); } + + template <typename _Tp, size_t _Bits, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs, + __type_identity_t<_Tp> __rhs) + { __lhs = _CommonImpl::_S_blend(__k, __lhs, __data(simd<_Tp, _Abi>(__rhs))); } + + template <typename _Op, typename _Tp, size_t _Bits, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs, + const __type_identity_t<_SveSimdWrapper<_Tp, _Np>> __rhs, _Op __op) + { + __lhs = _CommonImpl::_S_blend(__k, __lhs, + _SveSimdWrapper<_Tp, _Np>(__op(_SuperImpl{}, __lhs, __rhs))); + } + + template <typename _Op, typename _Tp, size_t _Bits, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_masked_cassign(const _SveMaskWrapper<_Bits, _Np> __k, _SveSimdWrapper<_Tp, _Np>& __lhs, + const __type_identity_t<_Tp> __rhs, _Op __op) + { _S_masked_cassign(__k, __lhs, _S_broadcast(__rhs), __op); } + + template <typename _Tp, size_t _Np, typename _Up> + _GLIBCXX_SIMD_INTRINSIC static constexpr void + _S_set(_SveSimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept + { __v._M_set(__i, static_cast<_Up&&>(__x)); } + + template <template <typename> class _Op, typename _Tp, size_t _Bits, size_t _Np> + _GLIBCXX_SIMD_INTRINSIC static constexpr _SveSimdWrapper<_Tp, _Np> + _S_masked_unary(const _SveMaskWrapper<_Bits, _Np> __k, const _SveSimdWrapper<_Tp, _Np> __v) + { + auto __vv = simd<_Tp, _Abi>{__private_init, __v}; + _Op<decltype(__vv)> __op; + return 
_CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
+      }
+  };
+
+template <typename _Abi, typename>
+  struct _MaskImplSve
+  {
+    template <typename _Tp>
+      using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
+
+    template <typename _Tp>
+      using _TypeTag = _Tp*;
+
+    template <typename _Tp>
+      static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_broadcast(bool __x)
+      {
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+	__sve_bool_type __tr = __sve_vector_type<_Tp, _Np>::__sve_active_mask();
+	__sve_bool_type __fl = svpfalse_b();
+	return __x ? __tr : __fl;
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_load(const bool* __mem)
+      {
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+	const uint8_t* __p = reinterpret_cast<const uint8_t*>(__mem);
+	__sve_bool_type __u8_active_mask = __sve_vector_type<uint8_t, _Np>::__sve_active_mask();
+	__sve_vector_type_t<uint8_t, _Np> __u8_vec_mask_load = svld1(__u8_active_mask, __p);
+	__sve_bool_type __u8_mask = svcmpne(__u8_active_mask, __u8_vec_mask_load, 0);
+
+	__sve_bool_type __tp_mask = __u8_mask;
+	for (size_t __up_size = 1; __up_size != sizeof(_Tp); __up_size *= 2)
+	  {
+	    __tp_mask = svunpklo(__tp_mask);
+	  }
+
+	_SveMaskWrapper<sizeof(_Tp), simd_size_v<_Tp, _Abi>> __r{__tp_mask};
+	return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      static inline _SveMaskWrapper<_Bits, _Np>
+      _S_masked_load(_SveMaskWrapper<_Bits, _Np> __merge, _SveMaskWrapper<_Bits, _Np> __mask,
+		     const bool* __mem) noexcept
+      {
+	_SveMaskWrapper<_Bits, _Np> __r;
+
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if (__mask[__i])
+	    __r._M_set(__i, __mem[__i]);
+	  else
+	    __r._M_set(__i, __merge[__i]);
+	});
+
+	return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_store(_SveMaskWrapper<_Bits, _Np> __v, bool* __mem) noexcept
+      {
+	__execute_n_times<_Np>([&](auto __i)
+	  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __mem[__i] = __v[__i]; });
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr void
+      _S_masked_store(const _SveMaskWrapper<_Bits, _Np> __v, bool* __mem,
+		      const _SveMaskWrapper<_Bits, _Np> __k) noexcept
+      {
+	__execute_n_times<_Np>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
+	  if (__k[__i])
+	    __mem[__i] = __v[__i];
+	});
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
+      _S_to_bits(_SveMaskWrapper<_Bits, _Np> __x)
+      {
+	_ULLong __r = 0;
+	__execute_n_times<_Np>(
+	  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r |= _ULLong(__x[__i]) << __i; });
+	return __r;
+      }
+
+    template <size_t _Np, typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
+      _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
+      {
+	_SveMaskWrapper<sizeof(_Tp), _Np> __r;
+	__execute_n_times<_Np>([&](auto __i)
+	  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __bits[__i]); });
+	return __r;
+      }
+
+    template <typename _Tp, typename _Up, typename _UAbi>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr auto
+      _S_convert(simd_mask<_Up, _UAbi> __x)
+      {
+	using _R = _SveMaskWrapper<sizeof(_Tp), simd_size_v<_Tp, _Abi>>;
+	if constexpr (__is_scalar_abi<_UAbi>())
+	  {
+	    _R __r{__sve_bool_type(svpfalse())};
+	    __r._M_set(0, __data(__x));
+	    return __r;
+	  }
+	if constexpr (__is_sve_abi<_UAbi>())
+	  {
+	    if constexpr (sizeof(_Up) == sizeof(_Tp))
+	      return __data(__x);
+	    if constexpr (sizeof(_Up) < sizeof(_Tp))
+	      {
+		__sve_bool_type __xmdata = __data(__x)._M_data;
+		__sve_bool_type __r = __xmdata;
+		for (size_t __up_size = sizeof(_Up); __up_size != sizeof(_Tp); __up_size *= 2)
+		  {
+		    __r = svunpklo(__r);
+		  }
+		return _R{__r};
+	      }
+	    else
+	      {
+		_R __r{__sve_bool_type(svpfalse())};
+		constexpr size_t __min_size
+		  = std::min(simd_size_v<_Tp, _Abi>, simd_mask<_Up, _UAbi>::size());
+		__execute_n_times<__min_size>(
+		  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+		return __r;
+	      }
+	  }
+	if constexpr (__is_neon_abi<_UAbi>())
+	  {
+	    _R __r{__sve_bool_type(svpfalse())};
+	    constexpr size_t __min_size
+	      = std::min(simd_size_v<_Tp, _Abi>, simd_mask<_Up, _UAbi>::size());
+	    __execute_n_times<__min_size>(
+	      [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+	    return __r;
+	  }
+	if constexpr (__is_fixed_size_abi<_UAbi>())
+	  {
+	    return _S_convert<_Tp>(__data(__x));
+	  }
+	return _R{};
+      }
+
+    template <typename _Tp, size_t _Np, bool _Sanitized>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
+      _S_convert(_BitMask<_Np, _Sanitized> __x)
+      {
+	_MaskMember<_Tp> __r{};
+	__execute_n_times<_Np>([&](auto __i)
+	  _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { __r._M_set(__i, __x[__i]); });
+	return __r;
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_logical_and(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svand_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_logical_or(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_not(const _SveMaskWrapper<_Bits, _Np>& __x)
+      {
+	return svnot_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_and(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svand_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_or(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static constexpr _SveMaskWrapper<_Bits, _Np>
+      _S_bit_xor(const _SveMaskWrapper<_Bits, _Np>& __x, const _SveMaskWrapper<_Bits, _Np>& __y)
+      {
+	return sveor_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+		       __x._M_data, __y._M_data);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      static constexpr void
+      _S_set(_SveMaskWrapper<_Bits, _Np>& __k, int __i, bool __x) noexcept
+      {
+	auto __index = svcmpeq(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+			       __sve_mask_type<_Bits>::__index0123,
+			       typename __sve_mask_type<_Bits>::__sve_mask_uint_type(__i));
+	if (__x)
+	  __k._M_data = svorr_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+				__k._M_data, __index);
+	else
+	  __k._M_data = svbic_z(_SveMaskWrapper<_Bits, _Np>::_BuiltinSveVectorType::__sve_active_mask(),
+				__k._M_data, __index);
+      }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveMaskWrapper<_Bits, _Np>& __lhs,
+		       _SveMaskWrapper<_Bits, _Np> __rhs)
+      { __lhs._M_data = svsel(__k._M_data, __rhs._M_data, __lhs._M_data); }
+
+    template <size_t _Bits, size_t _Np>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_masked_assign(_SveMaskWrapper<_Bits, _Np> __k, _SveMaskWrapper<_Bits, _Np>& __lhs,
+		       bool __rhs)
+      {
+	__lhs._M_data
+	  = svsel(__k._M_data, _S_broadcast<__int_with_sizeof_t<_Bits>>(__rhs), __lhs._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_popcount(simd_mask<_Tp, _Abi> __k)
+      {
+	constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
+
+	return __sve_mask_type<sizeof(_Tp)>::__sve_mask_active_count(
+		 __sve_vector_type<_Tp, _Np>::__sve_active_mask(), __k._M_data);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_all_of(simd_mask<_Tp, _Abi> __k)
+      { return _S_popcount(__k) == simd_size_v<_Tp, _Abi>; }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_any_of(simd_mask<_Tp, _Abi> __k)
+      { return svptest_any(__sve_vector_type<_Tp, simd_size_v<_Tp, _Abi>>::__sve_active_mask(), __k._M_data); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_none_of(simd_mask<_Tp, _Abi> __k)
+      { return !svptest_any(__sve_vector_type<_Tp, simd_size_v<_Tp, _Abi>>::__sve_active_mask(), __k._M_data); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static bool
+      _S_some_of(simd_mask<_Tp, _Abi> __k)
+      {
+	int __msk_count = _S_popcount(__k);
+	return (__msk_count > 0) && (__msk_count < (int) simd_size_v<_Tp, _Abi>);
+      }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_first_set(simd_mask<_Tp, _Abi> __k)
+      { return svclastb(svpfirst(__k._M_data, svpfalse()), -1, __sve_mask_type<sizeof(_Tp)>::__index0123); }
+
+    template <typename _Tp>
+      _GLIBCXX_SIMD_INTRINSIC static int
+      _S_find_last_set(simd_mask<_Tp, _Abi> __k)
+      { return svclastb(__k._M_data, -1, __sve_mask_type<sizeof(_Tp)>::__index0123); }
+  };
+
+_GLIBCXX_SIMD_END_NAMESPACE
+#endif // __cplusplus >= 201703L
+#endif // _GLIBCXX_EXPERIMENTAL_SIMD_SVE_H_
+// vim: sw=2 noet ts=8 sts=2 tw=100
diff --git a/libstdc++-v3/include/experimental/simd b/libstdc++-v3/include/experimental/simd
index 2b9bdf9239a..1cb70713ebc 100644
--- a/libstdc++-v3/include/experimental/simd
+++ b/libstdc++-v3/include/experimental/simd
@@ -80,6 +80,9 @@
 #include "bits/simd_x86.h"
 #elif _GLIBCXX_SIMD_HAVE_NEON
 #include "bits/simd_neon.h"
+#if _GLIBCXX_SIMD_HAVE_SVE
+#include "bits/simd_sve.h"
+#endif
 #elif __ALTIVEC__
 #include "bits/simd_ppc.h"
 #endif
diff --git a/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h b/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
index 270b433aa17..880495cda34 100644
--- a/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
+++ b/libstdc++-v3/testsuite/experimental/simd/tests/bits/main.h
@@ -29,6 +29,9 @@ template <class T>
   invoke_test<simd<T, simd_abi::scalar>>(int());
   invoke_test<simd<T, simd_abi::_VecBuiltin<16>>>(int());
   invoke_test<simd<T, simd_abi::_VecBltnBtmsk<64>>>(int());
+  invoke_test<simd<T, simd_abi::_SveAbi<16>>>(int());
+  invoke_test<simd<T, simd_abi::_SveAbi<32>>>(int());
+  invoke_test<simd<T, simd_abi::_SveAbi<64>>>(int());
 #elif EXTENDEDTESTS == 0
   invoke_test<simd<T, simd_abi::_VecBuiltin<8>>>(int());
   invoke_test<simd<T, simd_abi::_VecBuiltin<12>>>(int());
-- 
2.34.1
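P.S. For reviewers who want to exercise the new code paths by hand (outside the testsuite), here is a minimal sketch. It is not part of the patch; the file name, the vector-length value, and the assumption that native_simd maps onto the SVE ABI under -msve-vector-bits are illustrative only.

// sve_simd_demo.cc -- minimal sketch, not part of the patch.
// Assumes a GCC build with this patch applied and an SVE-capable target, e.g.:
//   g++ -std=c++17 -march=armv8-a+sve -msve-vector-bits=256 sve_simd_demo.cc
#include <experimental/simd>
#include <iostream>

namespace stdx = std::experimental;

int main()
{
  // With a fixed SVE vector length, native_simd<float> is expected to use the
  // new SVE ABI (assumption based on the cover letter); otherwise it falls
  // back to the existing NEON/builtin ABIs and the demo still runs.
  using V = stdx::native_simd<float>;
  V a([](auto i) { return float(i); });   // generator ctor: 0, 1, 2, ...
  V b = 3.0f;                             // broadcast
  V c = stdx::min(a + b, V(10.0f));       // infix operators and min
  std::cout << stdx::reduce(c) << ' '            // plus<> reduction
            << stdx::popcount(c > 5.0f) << '\n'; // mask comparison and popcount
  return 0;
}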