>From 7a7c9a2a976fbb29f67c46284e7c1581cbe8cb07 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik <abhishek.kaus...@intel.com> Date: Fri, 31 Jan 2025 01:28:48 -0800 Subject: [PATCH] libstdc++: Use canonical loop form in std::reduce
This change is for the INTEL C compiler (icx). The current while loop in std::reduce and related functions is hard to vectorize because the loop control variable is hard to detect in icx. `while ((__last - __first) >= 4)` Changing the loop header to a for loop following the OpenMP canonical form allows easy vectorization, resulting in improved performance. `for (; __first <= __last - 4; __first += 4)` This patch modifies the loop header for std::reduce & std::transform_reduce. --- libstdc++-v3/include/std/numeric | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/libstdc++-v3/include/std/numeric b/libstdc++-v3/include/std/numeric index 4d36fcd36d9..9c38ad89e21 100644 --- a/libstdc++-v3/include/std/numeric +++ b/libstdc++-v3/include/std/numeric @@ -300,13 +300,12 @@ namespace __detail static_assert(is_invocable_r_v<_Tp, _BinaryOperation&, __ref, __ref>); if constexpr (__is_random_access_iter<_InputIterator>::value) { - while ((__last - __first) >= 4) + for (; __first <= __last - 4; __first += 4) { _Tp __v1 = __binary_op(__first[0], __first[1]); _Tp __v2 = __binary_op(__first[2], __first[3]); _Tp __v3 = __binary_op(__v1, __v2); __init = __binary_op(__init, __v3); - __first += 4; } } for (; __first != __last; ++__first) @@ -381,7 +380,7 @@ namespace __detail if constexpr (__and_v<__is_random_access_iter<_InputIterator1>, __is_random_access_iter<_InputIterator2>>) { - while ((__last1 - __first1) >= 4) + for (; __first1 <= __last1 - 4; __first1 += 4, __first2 += 4) { _Tp __v1 = __binary_op1(__binary_op2(__first1[0], __first2[0]), __binary_op2(__first1[1], __first2[1])); @@ -389,8 +388,6 @@ namespace __detail __binary_op2(__first1[3], __first2[3])); _Tp __v3 = __binary_op1(__v1, __v2); __init = __binary_op1(__init, __v3); - __first1 += 4; - __first2 += 4; } } for (; __first1 != __last1; ++__first1, (void) ++__first2) @@ -447,7 +444,7 @@ namespace __detail { if constexpr (__is_random_access_iter<_InputIterator>::value) { - while ((__last - __first) >= 4) + for (; __first <= __last - 4; __first += 4) { _Tp __v1 = __binary_op(__unary_op(__first[0]), __unary_op(__first[1])); @@ -455,7 +452,6 @@ namespace __detail __unary_op(__first[3])); _Tp __v3 = __binary_op(__v1, __v2); __init = __binary_op(__init, __v3); - __first += 4; } } for (; __first != __last; ++__first) -- 2.31.1 ________________________________ From: Abhishek Kaushik Sent: Friday, January 31, 2025 4:28 PM To: libstd...@gcc.gnu.org <libstd...@gcc.gnu.org> Cc: gcc-patches@gcc.gnu.org <gcc-patches@gcc.gnu.org> Subject: [PATCH] libstdc++: Use canonical loop form in std::reduce >From 4ac7c7e56e23ed2f4dd2dafdfab6cfa110c14260 Mon Sep 17 00:00:00 2001 From: Abhishek Kaushik <abhishek.kaus...@intel.com> Date: Fri, 31 Jan 2025 01:28:48 -0800 Subject: [PATCH] libstdc++: Use canonical loop form in std::reduce The current while loop in std::reduce and related functions is hard to vectorize because the loop control variable is hard to detect. `while ((__last - __first) >= 4)` Changing the loop header to a for loop following the OpenMP canonical form allows easy vectorization, resulting in improved performance. `for (; __first <= __last - 4; __first += 4)` This patch modifies the loop header for std::reduce & std::transform_reduce. --- libstdc++-v3/include/std/numeric | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/libstdc++-v3/include/std/numeric b/libstdc++-v3/include/std/numeric index 4d36fcd36d9..9c38ad89e21 100644 --- a/libstdc++-v3/include/std/numeric +++ b/libstdc++-v3/include/std/numeric @@ -300,13 +300,12 @@ namespace __detail static_assert(is_invocable_r_v<_Tp, _BinaryOperation&, __ref, __ref>); if constexpr (__is_random_access_iter<_InputIterator>::value) { - while ((__last - __first) >= 4) + for (; __first <= __last - 4; __first += 4) { _Tp __v1 = __binary_op(__first[0], __first[1]); _Tp __v2 = __binary_op(__first[2], __first[3]); _Tp __v3 = __binary_op(__v1, __v2); __init = __binary_op(__init, __v3); - __first += 4; } } for (; __first != __last; ++__first) @@ -381,7 +380,7 @@ namespace __detail if constexpr (__and_v<__is_random_access_iter<_InputIterator1>, __is_random_access_iter<_InputIterator2>>) { - while ((__last1 - __first1) >= 4) + for (; __first1 <= __last1 - 4; __first1 += 4, __first2 += 4) { _Tp __v1 = __binary_op1(__binary_op2(__first1[0], __first2[0]), __binary_op2(__first1[1], __first2[1])); @@ -389,8 +388,6 @@ namespace __detail __binary_op2(__first1[3], __first2[3])); _Tp __v3 = __binary_op1(__v1, __v2); __init = __binary_op1(__init, __v3); - __first1 += 4; - __first2 += 4; } } for (; __first1 != __last1; ++__first1, (void) ++__first2) @@ -447,7 +444,7 @@ namespace __detail { if constexpr (__is_random_access_iter<_InputIterator>::value) { - while ((__last - __first) >= 4) + for (; __first <= __last - 4; __first += 4) { _Tp __v1 = __binary_op(__unary_op(__first[0]), __unary_op(__first[1])); @@ -455,7 +452,6 @@ namespace __detail __unary_op(__first[3])); _Tp __v3 = __binary_op(__v1, __v2); __init = __binary_op(__init, __v3); - __first += 4; } } for (; __first != __last; ++__first) -- 2.31.1