[PATCH] D44661: [libcxx] optimize reduce(), hmin(), hmax() by reordering the operations.

Tim Shen via Phabricator via cfe-commits Mon, 19 Mar 2018 17:47:36 -0700

timshen created this revision.
timshen added a reviewer: mclow.lists.
Herald added subscribers: christof, sanjoy.
Herald added a reviewer: EricWF.


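In addition to reordering the operations, change std::plus<_Tp> to the
transparent std::plus<> (aliased as __simd_plus_op, with a hand-written
equivalent for pre-C++14 builds), so that the optimized reduction can apply
the operation directly to simd<> values through their overloaded operators.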


https://reviews.llvm.org/D44661

Files:
  libcxx/include/experimental/simd
  libcxx/test/std/experimental/simd/simd.horizontal/hmax.pass.cpp
  libcxx/test/std/experimental/simd/simd.horizontal/hmin.pass.cpp
  libcxx/test/std/experimental/simd/simd.horizontal/reduce.pass.cpp

Index: libcxx/test/std/experimental/simd/simd.horizontal/reduce.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.horizontal/reduce.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.horizontal/reduce.pass.cpp
@@ -38,52 +38,64 @@
 #include <cstdint>
 #include <experimental/simd>
 
+#include "test_macros.h"
+
 using namespace std::experimental::parallelism_v2;
 
 inline int factorial(int n) { return n == 1 ? 1 : n * factorial(n - 1); }
 
+template <class SimdType>
 void test_reduce_simd() {
-  int n = (int)native_simd<int>::size();
-  assert(reduce(native_simd<int>([](int i) { return i; })) == n * (n - 1) / 2);
-  assert(reduce(native_simd<int>([](int i) { return i; }), std::plus<int>()) ==
+  int n = (int)SimdType::size();
+  assert(reduce(SimdType([](int i) { return i; })) == n * (n - 1) / 2);
+
+#if TEST_STD_VER >= 14
+  assert(reduce(SimdType([](int i) { return i; }), std::plus<>()) ==
          n * (n - 1) / 2);
-  assert(reduce(native_simd<int>([](int i) { return i + 1; }),
-                std::multiplies<int>()) == factorial(n));
+  assert(reduce(SimdType([](int i) { return i + 1; }), std::multiplies<>()) ==
+         factorial(n));
+#endif
 }
 
 void test_reduce_mask() {
   {
     fixed_size_simd<int, 4> a([](int i) { return i; });
-    assert(reduce(where(a < 2, a), 0, std::plus<int>()) == 0 + 1);
-    assert(reduce(where(a >= 2, a), 1, std::multiplies<int>()) == 2 * 3);
     assert(reduce(where(a >= 2, a)) == 2 + 3);
-    assert(reduce(where(a >= 2, a), std::plus<int>()) == 2 + 3);
-    assert(reduce(where(a >= 2, a), std::multiplies<int>()) == 2 * 3);
-    assert(reduce(where(a >= 2, a), std::bit_and<int>()) == (2 & 3));
-    assert(reduce(where(a >= 2, a), std::bit_or<int>()) == (2 | 3));
-    assert(reduce(where(a >= 2, a), std::bit_xor<int>()) == (2 ^ 3));
+#if TEST_STD_VER >= 14
+    assert(reduce(where(a < 2, a), 0, std::plus<>()) == 0 + 1);
+    assert(reduce(where(a >= 2, a), 1, std::multiplies<>()) == 2 * 3);
+    assert(reduce(where(a >= 2, a), std::plus<>()) == 2 + 3);
+    assert(reduce(where(a >= 2, a), std::multiplies<>()) == 2 * 3);
+    assert(reduce(where(a >= 2, a), std::bit_and<>()) == (2 & 3));
+    assert(reduce(where(a >= 2, a), std::bit_or<>()) == (2 | 3));
+    assert(reduce(where(a >= 2, a), std::bit_xor<>()) == (2 ^ 3));
+#endif
   }
   {
     fixed_size_simd_mask<int, 4> a;
     a[0] = false;
     a[1] = true;
     a[2] = true;
     a[3] = false;
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a)) == true);
+#if TEST_STD_VER >= 14
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a),
-                  std::plus<bool>()) == true);
+                  std::plus<>()) == true);
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a),
-                  std::multiplies<bool>()) == false);
+                  std::multiplies<>()) == false);
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a),
-                  std::bit_and<bool>()) == false);
+                  std::bit_and<>()) == false);
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a),
-                  std::bit_or<bool>()) == true);
+                  std::bit_or<>()) == true);
     assert(reduce(where(fixed_size_simd_mask<int, 4>(true), a),
-                  std::bit_xor<bool>()) == false);
+                  std::bit_xor<>()) == false);
+#endif
   }
 }
 
 int main() {
-  test_reduce_simd();
+  test_reduce_simd<native_simd<int>>();
+  test_reduce_simd<fixed_size_simd<int, 4>>();
+  test_reduce_simd<fixed_size_simd<int, 5>>();
   test_reduce_mask();
 }
Index: libcxx/test/std/experimental/simd/simd.horizontal/hmin.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.horizontal/hmin.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.horizontal/hmin.pass.cpp
@@ -20,22 +20,47 @@
 
 using namespace std::experimental::parallelism_v2;
 
-void test_hmin_simd() {
+template <class SimdType>
+void test_hmin_simd_power_of_2() {
   {
     int a[] = {2, 5, -4, 6};
-    assert(hmin(fixed_size_simd<int, 4>(a, element_aligned_tag())) == -4);
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
   }
   {
     int a[] = {6, 2, 5, -4};
-    assert(hmin(fixed_size_simd<int, 4>(a, element_aligned_tag())) == -4);
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
   }
   {
     int a[] = {-4, 6, 2, 5};
-    assert(hmin(fixed_size_simd<int, 4>(a, element_aligned_tag())) == -4);
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
   }
   {
     int a[] = {5, -4, 6, 2};
-    assert(hmin(fixed_size_simd<int, 4>(a, element_aligned_tag())) == -4);
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
+  }
+}
+
+template <class SimdType>
+void test_hmin_simd() {
+  {
+    int a[] = {0, 2, 5, -4, 6};
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
+  }
+  {
+    int a[] = {0, 6, 2, 5, -4};
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
+  }
+  {
+    int a[] = {-4, 0, 5, 6, 2};
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
+  }
+  {
+    int a[] = {0, -4, 6, 2, 5};
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
+  }
+  {
+    int a[] = {0, 5, -4, 6, 2};
+    assert(hmin(SimdType(a, element_aligned_tag())) == -4);
   }
 }
 
@@ -60,6 +85,12 @@
 }
 
 int main() {
-  test_hmin_simd();
+  test_hmin_simd_power_of_2<fixed_size_simd<int, 4>>();
+  test_hmin_simd_power_of_2<
+      simd<int, rebind_abi_t<int, 4, simd_abi::native<int>>>>();
+  test_hmin_simd<fixed_size_simd<int, 5>>();
+  test_hmin_simd<simd<int, rebind_abi_t<int, 5, simd_abi::native<int>>>>();
+  test_hmin_simd<fixed_size_simd<int, 5>>();
+  test_hmin_simd<simd<int, rebind_abi_t<int, 5, simd_abi::native<int>>>>();
   test_hmin_mask();
 }
Index: libcxx/test/std/experimental/simd/simd.horizontal/hmax.pass.cpp
===================================================================
--- libcxx/test/std/experimental/simd/simd.horizontal/hmax.pass.cpp
+++ libcxx/test/std/experimental/simd/simd.horizontal/hmax.pass.cpp
@@ -20,22 +20,47 @@
 
 using namespace std::experimental::parallelism_v2;
 
-void test_hmax_simd() {
+template <class SimdType>
+void test_hmax_simd_power_of_2() {
   {
     int a[] = {2, 5, -4, 6};
-    assert(hmax(fixed_size_simd<int, 4>(a, element_aligned_tag())) == 6);
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
   }
   {
     int a[] = {6, 2, 5, -4};
-    assert(hmax(fixed_size_simd<int, 4>(a, element_aligned_tag())) == 6);
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
   }
   {
     int a[] = {-4, 6, 2, 5};
-    assert(hmax(fixed_size_simd<int, 4>(a, element_aligned_tag())) == 6);
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
   }
   {
     int a[] = {5, -4, 6, 2};
-    assert(hmax(fixed_size_simd<int, 4>(a, element_aligned_tag())) == 6);
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
+  }
+}
+
+template <class SimdType>
+void test_hmax_simd() {
+  {
+    int a[] = {0, 2, 5, -4, 6};
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
+  }
+  {
+    int a[] = {6, 0, 2, 5, -4};
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
+  }
+  {
+    int a[] = {0, 6, 2, 5, -4};
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
+  }
+  {
+    int a[] = {0, -4, 6, 2, 5};
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
+  }
+  {
+    int a[] = {0, 5, -4, 6, 2};
+    assert(hmax(SimdType(a, element_aligned_tag())) == 6);
   }
 }
 
@@ -67,6 +92,10 @@
 }
 
 int main() {
-  test_hmax_simd();
+  test_hmax_simd_power_of_2<fixed_size_simd<int, 4>>();
+  test_hmax_simd_power_of_2<
+      simd<int, rebind_abi_t<int, 4, simd_abi::native<int>>>>();
+  test_hmax_simd<fixed_size_simd<int, 5>>();
+  test_hmax_simd<simd<int, rebind_abi_t<int, 5, simd_abi::native<int>>>>();
   test_hmax_mask();
 }
Index: libcxx/include/experimental/simd
===================================================================
--- libcxx/include/experimental/simd
+++ libcxx/include/experimental/simd
@@ -1368,6 +1368,19 @@
 };
 #endif // !defined(_LIBCPP_HAS_NO_INT128)
 
+#if _LIBCPP_STD_VER > 11
+using __simd_plus_op = std::plus<>;
+#else
+struct __simd_plus_op {
+  template <class _T1, class _T2>
+  inline auto operator()(_T1&& __t, _T2&& __u) const
+      noexcept(noexcept(std::forward<_T1>(__t) + std::forward<_T2>(__u)))
+          -> decltype(std::forward<_T1>(__t) + std::forward<_T2>(__u)) {
+    return std::forward<_T1>(__t) + std::forward<_T2>(__u);
+  }
+};
+#endif
+
 _LIBCPP_END_NAMESPACE_EXPERIMENTAL_SIMD
 _LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL_SIMD_ABI
 
@@ -2071,35 +2084,124 @@
   return 0;
 }
 
+template <class _Tp, class _Abi, size_t... __indices>
+std::array<simd<_Tp, rebind_abi_t<_Tp, simd<_Tp, _Abi>::size() / 2, _Abi>>, 2>
+__deinterleave_impl(const simd<_Tp, _Abi>& __v,
+                    std::index_sequence<__indices...>) {
+  return {{__simd_shuffle<(2 * __indices)...>(__v, __v),
+           __simd_shuffle<(2 * __indices + 1)...>(__v, __v)}};
+}
+
+template <class _Tp, class _Abi>
+std::array<simd<_Tp, rebind_abi_t<_Tp, simd<_Tp, _Abi>::size() / 2, _Abi>>, 2>
+__deinterleave(const simd<_Tp, _Abi>& __v) {
+  static_assert(simd<_Tp, _Abi>::size() % 2 == 0, "");
+  return __deinterleave_impl(
+      __v, std::make_index_sequence<simd<_Tp, _Abi>::size() / 2>());
+}
+
 // reductions [simd.reductions]
 template <class _SimdType, class _BinaryOp>
-typename _SimdType::value_type __reduce(const _SimdType& __v, _BinaryOp __op) {
+typename std::enable_if<_SimdType::size() == 1,
+                        typename _SimdType::value_type>::type
+__reduce(const _SimdType& __v, _BinaryOp) {
+  return __v[0];
+}
+
+template <class _SimdType, class _BinaryOp>
+typename std::enable_if<(_SimdType::size() > 1 && is_simd<_SimdType>::value &&
+                         __floor_pow_of_2(_SimdType::size()) ==
+                             _SimdType::size()),
+                        typename _SimdType::value_type>::type
+__reduce(const _SimdType& __v, _BinaryOp __op) {
+  if (std::is_same<_BinaryOp, __simd_plus_op>::value) {
+    using _Tp = typename _SimdType::value_type;
+    if (std::is_integral<_Tp>::value && sizeof(_Tp) < 8) {
+      auto __arr = __deinterleave(__v);
+      return __reduce(__arr[0] + __arr[1], __op);
+    }
+  }
+  auto __arr = split_by<2>(__v);
+  return __reduce(__op(__arr[0], __arr[1]), __op);
+}
+
+template <class _SimdType, class _BinaryOp>
+typename std::enable_if<(_SimdType::size() > 1 &&
+                         !(is_simd<_SimdType>::value &&
+                           __floor_pow_of_2(_SimdType::size()) ==
+                               _SimdType::size())),
+                        typename _SimdType::value_type>::type
+__reduce(const _SimdType& __v, _BinaryOp __op) {
   auto __acc = __v[0];
   for (size_t __i = 1; __i < __v.size(); __i++) {
     __acc = __op(__acc, __v[__i]);
   }
   return __acc;
 }
 
 template <class _SimdType>
-typename _SimdType::value_type __hmin(const _SimdType& __v) {
+typename std::enable_if<_SimdType::size() == 1,
+                        typename _SimdType::value_type>::type
+__hmin(const _SimdType& __v) {
+  return __v[0];
+}
+
+template <class _SimdType>
+typename std::enable_if<(_SimdType::size() > 1 && is_simd<_SimdType>::value &&
+                         __floor_pow_of_2(_SimdType::size()) ==
+                             _SimdType::size()),
+                        typename _SimdType::value_type>::type
+__hmin(const _SimdType& __v) {
+  auto __arr = split_by<2>(__v);
+  return __hmin(min(__arr[0], __arr[1]));
+}
+
+template <class _SimdType>
+typename std::enable_if<(_SimdType::size() > 1 &&
+                         !(is_simd<_SimdType>::value &&
+                           __floor_pow_of_2(_SimdType::size()) ==
+                               _SimdType::size())),
+                        typename _SimdType::value_type>::type
+__hmin(const _SimdType& __v) {
   auto __acc = __v[0];
   for (size_t __i = 1; __i < __v.size(); __i++) {
     __acc = __acc > __v[__i] ? __v[__i] : __acc;
   }
   return __acc;
 }
 
 template <class _SimdType>
-typename _SimdType::value_type __hmax(const _SimdType& __v) {
+typename std::enable_if<_SimdType::size() == 1,
+                        typename _SimdType::value_type>::type
+__hmax(const _SimdType& __v) {
+  return __v[0];
+}
+
+template <class _SimdType>
+typename std::enable_if<(_SimdType::size() > 1 && is_simd<_SimdType>::value &&
+                         __floor_pow_of_2(_SimdType::size()) ==
+                             _SimdType::size()),
+                        typename _SimdType::value_type>::type
+__hmax(const _SimdType& __v) {
+  auto __arr = split_by<2>(__v);
+  return __hmax(max(__arr[0], __arr[1]));
+}
+
+template <class _SimdType>
+typename std::enable_if<(_SimdType::size() > 1 &&
+                         !(is_simd<_SimdType>::value &&
+                           __floor_pow_of_2(_SimdType::size()) ==
+                               _SimdType::size())),
+                        typename _SimdType::value_type>::type
+__hmax(const _SimdType& __v) {
   auto __acc = __v[0];
   for (size_t __i = 1; __i < __v.size(); __i++) {
     __acc = __acc < __v[__i] ? __v[__i] : __acc;
   }
   return __acc;
 }
 
-template <class _Tp, class _Abi, class _BinaryOp = std::plus<_Tp>>
+template <class _Tp, class _Abi, class _BinaryOp = __simd_plus_op>
 _Tp reduce(const simd<_Tp, _Abi>& __v, _BinaryOp __op = _BinaryOp()) {
   return __reduce(__v, __op);
 }
@@ -3017,38 +3119,41 @@
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
 reduce(const const_where_expression<_MaskType, _SimdType>& __w,
-       plus<typename _SimdType::value_type> __op = {}) {
+       __simd_plus_op __op = {}) {
   return reduce(__w, typename _SimdType::value_type(0), __op);
 }
 
+#if _LIBCPP_STD_VER > 11
+
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
 reduce(const const_where_expression<_MaskType, _SimdType>& __w,
-       multiplies<typename _SimdType::value_type> __op) {
+       multiplies<> __op) {
   return reduce(__w, typename _SimdType::value_type(1), __op);
 }
 
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
 reduce(const const_where_expression<_MaskType, _SimdType>& __w,
-       bit_and<typename _SimdType::value_type> __op) {
+       bit_and<> __op) {
   return reduce(__w, typename _SimdType::value_type(-1), __op);
 }
 
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
-reduce(const const_where_expression<_MaskType, _SimdType>& __w,
-       bit_or<typename _SimdType::value_type> __op) {
+reduce(const const_where_expression<_MaskType, _SimdType>& __w, bit_or<> __op) {
   return reduce(__w, typename _SimdType::value_type(0), __op);
 }
 
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
 reduce(const const_where_expression<_MaskType, _SimdType>& __w,
-       bit_xor<typename _SimdType::value_type> __op) {
+       bit_xor<> __op) {
   return reduce(__w, typename _SimdType::value_type(0), __op);
 }
 
+#endif // _LIBCPP_STD_VER > 11
+
 template <class _MaskType, class _SimdType>
 typename _SimdType::value_type
 hmin(const const_where_expression<_MaskType, _SimdType>& __w) {

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D44661: [libcxx] optimize reduce(), hmin(), hmax() by reordering the operations.

Reply via email to