sc/source/core/inc/arraysumfunctor.hxx | 58 ++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 19 deletions(-)
New commits: commit f814b00bc908c5498156194f45bf8f9c0b8268ac Author: Tomaž Vajngerl <tomaz.vajng...@collabora.com> Date: Thu Nov 5 00:32:44 2015 +0100 Fast array sum: aligned load, process 8 doubles per loop * isAligned checks if a pointer address is aligned to the required number of bytes. * Process the array until we are aligned. Change-Id: Id42a8a2628b2797f7870ec8cd29a183087f9911e diff --git a/sc/source/core/inc/arraysumfunctor.hxx b/sc/source/core/inc/arraysumfunctor.hxx index 9e4ce97..fc1b915 100644 --- a/sc/source/core/inc/arraysumfunctor.hxx +++ b/sc/source/core/inc/arraysumfunctor.hxx @@ -17,6 +17,12 @@ namespace sc { +template<typename T, unsigned int N> +inline bool isAligned(const T* pointer) +{ + return 0 == (uintptr_t(pointer) % N); +} + struct ArraySumFunctor { private: @@ -30,18 +36,26 @@ public: { } - double operator() () const + double operator() () { static bool hasSSE2 = tools::cpuid::hasSSE2(); printf("SSE used %d\n", hasSSE2); double fSum = 0.0; size_t i = 0; + const double* pCurrent = mpArray; if (hasSSE2) - fSum += executeSSE2(i); + { + while (!isAligned<double, 16>(pCurrent)) + { + fSum += *pCurrent++; + i++; + } + fSum += executeSSE2(i, pCurrent); + } else - fSum += executeUnrolled(i); + fSum += executeUnrolled(i, pCurrent); // sum rest of the array @@ -52,27 +66,34 @@ public: } private: - inline double executeSSE2(size_t& i) const + inline double executeSSE2(size_t& i, const double* pCurrent) const { double fSum = 0.0; - size_t nUnrolledSize = mnSize - (mnSize % 4); + size_t nRealSize = mnSize - i; + size_t nUnrolledSize = nRealSize - (nRealSize % 8); if (nUnrolledSize > 0) { - register __m128d sum1 = _mm_set_pd(0.0, 0.0); - register __m128d sum2 = _mm_set_pd(0.0, 0.0); + __m128d sum1 = _mm_setzero_pd(); + __m128d sum2 = _mm_setzero_pd(); + __m128d sum3 = _mm_setzero_pd(); + __m128d sum4 = _mm_setzero_pd(); - const double* pCurrent = mpArray; - - for (; i < nUnrolledSize; i += 4) + for (; i < nUnrolledSize; i += 8) { - sum1 = _mm_add_pd(sum1, 
_mm_loadu_pd(pCurrent)); - pCurrent += 2; + __m128d load1 = _mm_load_pd(&pCurrent[i]); + sum1 = _mm_add_pd(sum1, load1); + + __m128d load2 = _mm_load_pd(&pCurrent[i + 2]); + sum2 = _mm_add_pd(sum2, load2); - sum2 = _mm_add_pd(sum2, _mm_loadu_pd(pCurrent)); - pCurrent += 2; + __m128d load3 = _mm_load_pd(&pCurrent[i + 4]); + sum3 = _mm_add_pd(sum3, load3); + + __m128d load4 = _mm_load_pd(&pCurrent[i + 6]); + sum4 = _mm_add_pd(sum4, load4); } - sum1 = _mm_add_pd(sum1, sum2); + sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4)); double temp; @@ -85,9 +106,10 @@ private: return fSum; } - inline double executeUnrolled(size_t& i) const + inline double executeUnrolled(size_t& i, const double* pCurrent) const { - size_t nUnrolledSize = mnSize - (mnSize % 4); + size_t nRealSize = mnSize - i; + size_t nUnrolledSize = nRealSize - (nRealSize % 4); if (nUnrolledSize > 0) { @@ -96,8 +118,6 @@ private: double sum2 = 0.0; double sum3 = 0.0; - const double* pCurrent = mpArray; - for (; i < nUnrolledSize; i += 4) { sum0 += *pCurrent++;
_______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits