arraysumfunctor: fast sum a double array, use for SUM() in Calc
This adds an array sum functor which sums a double array as fast as possible. There are 2 implementations: SSE2 and a simple unrolled implementation. The SSE2 implementation is used if SSE2 is detected at runtime. Additional info: the SSE2 implementation first processes the array element by element until the current position is aligned to a 16-byte boundary (this should process at most 1 element). Then the array is processed by summing 8 values in one pass (using 4 variables that are 128 bits wide), where each SSE2 operation processes 2 double values in one call. Change-Id: I24494b08cae049aa3eabcb086867f1bdd4128374
This commit is contained in:
committed by
Tomaž Vajngerl
parent
154bcd887d
commit
5493402fb3
141
sc/source/core/inc/arraysumfunctor.hxx
Normal file
141
sc/source/core/inc/arraysumfunctor.hxx
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
|
||||||
|
/*
|
||||||
|
* This file is part of the LibreOffice project.
|
||||||
|
*
|
||||||
|
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
|
||||||
|
#define INCLUDED_SC_SOURCE_CORE_INC_ARRAYSUMFUNCTOR_HXX
|
||||||
|
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#include <tools/cpuid.hxx>
|
||||||
|
|
||||||
|
namespace sc
|
||||||
|
{
|
||||||
|
|
||||||
|
/** Tells whether the address held in pointer is an exact multiple of N.

    @tparam T  pointee type (only used to type the argument)
    @tparam N  alignment to test for, in bytes
    @param pointer  the address to inspect; it is never dereferenced
    @return true if the numeric value of the address is divisible by N
 */
template<typename T, unsigned int N>
inline bool isAligned(const T* pointer)
{
    const uintptr_t nAddress = reinterpret_cast<uintptr_t>(pointer);
    return (nAddress % N) == 0;
}
|
||||||
|
|
||||||
|
struct ArraySumFunctor
|
||||||
|
{
|
||||||
|
private:
|
||||||
|
const double* mpArray;
|
||||||
|
size_t mnSize;
|
||||||
|
|
||||||
|
public:
|
||||||
|
ArraySumFunctor(const double* pArray, size_t nSize)
|
||||||
|
: mpArray(pArray)
|
||||||
|
, mnSize(nSize)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
double operator() ()
|
||||||
|
{
|
||||||
|
static bool hasSSE2 = tools::cpuid::hasSSE2();
|
||||||
|
|
||||||
|
double fSum = 0.0;
|
||||||
|
size_t i = 0;
|
||||||
|
const double* pCurrent = mpArray;
|
||||||
|
|
||||||
|
if (hasSSE2)
|
||||||
|
{
|
||||||
|
while (!isAligned<double, 16>(pCurrent))
|
||||||
|
{
|
||||||
|
fSum += *pCurrent++;
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
fSum += executeSSE2(i, pCurrent);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
fSum += executeUnrolled(i, pCurrent);
|
||||||
|
|
||||||
|
// sum rest of the array
|
||||||
|
|
||||||
|
for (; i < mnSize; ++i)
|
||||||
|
fSum += mpArray[i];
|
||||||
|
|
||||||
|
return fSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
inline double executeSSE2(size_t& i, const double* pCurrent) const
|
||||||
|
{
|
||||||
|
double fSum = 0.0;
|
||||||
|
size_t nRealSize = mnSize - i;
|
||||||
|
size_t nUnrolledSize = nRealSize - (nRealSize % 8);
|
||||||
|
|
||||||
|
if (nUnrolledSize > 0)
|
||||||
|
{
|
||||||
|
__m128d sum1 = _mm_setzero_pd();
|
||||||
|
__m128d sum2 = _mm_setzero_pd();
|
||||||
|
__m128d sum3 = _mm_setzero_pd();
|
||||||
|
__m128d sum4 = _mm_setzero_pd();
|
||||||
|
|
||||||
|
for (; i < nUnrolledSize; i += 8)
|
||||||
|
{
|
||||||
|
__m128d load1 = _mm_load_pd(pCurrent);
|
||||||
|
sum1 = _mm_add_pd(sum1, load1);
|
||||||
|
pCurrent += 2;
|
||||||
|
|
||||||
|
__m128d load2 = _mm_load_pd(pCurrent);
|
||||||
|
sum2 = _mm_add_pd(sum2, load2);
|
||||||
|
pCurrent += 2;
|
||||||
|
|
||||||
|
__m128d load3 = _mm_load_pd(pCurrent);
|
||||||
|
sum3 = _mm_add_pd(sum3, load3);
|
||||||
|
pCurrent += 2;
|
||||||
|
|
||||||
|
__m128d load4 = _mm_load_pd(pCurrent);
|
||||||
|
sum4 = _mm_add_pd(sum4, load4);
|
||||||
|
pCurrent += 2;
|
||||||
|
}
|
||||||
|
sum1 = _mm_add_pd(_mm_add_pd(sum1, sum2), _mm_add_pd(sum3, sum4));
|
||||||
|
|
||||||
|
double temp;
|
||||||
|
|
||||||
|
_mm_storel_pd(&temp, sum1);
|
||||||
|
fSum += temp;
|
||||||
|
|
||||||
|
_mm_storeh_pd(&temp, sum1);
|
||||||
|
fSum += temp;
|
||||||
|
}
|
||||||
|
return fSum;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline double executeUnrolled(size_t& i, const double* pCurrent) const
|
||||||
|
{
|
||||||
|
size_t nRealSize = mnSize - i;
|
||||||
|
size_t nUnrolledSize = nRealSize - (nRealSize % 4);
|
||||||
|
|
||||||
|
if (nUnrolledSize > 0)
|
||||||
|
{
|
||||||
|
double sum0 = 0.0;
|
||||||
|
double sum1 = 0.0;
|
||||||
|
double sum2 = 0.0;
|
||||||
|
double sum3 = 0.0;
|
||||||
|
|
||||||
|
for (; i < nUnrolledSize; i += 4)
|
||||||
|
{
|
||||||
|
sum0 += *pCurrent++;
|
||||||
|
sum1 += *pCurrent++;
|
||||||
|
sum2 += *pCurrent++;
|
||||||
|
sum3 += *pCurrent++;
|
||||||
|
}
|
||||||
|
return sum0 + sum1 + sum2 + sum3;
|
||||||
|
}
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end namespace sc
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
|
@@ -26,6 +26,8 @@
|
|||||||
#include "mtvcellfunc.hxx"
|
#include "mtvcellfunc.hxx"
|
||||||
#include "scmatrix.hxx"
|
#include "scmatrix.hxx"
|
||||||
|
|
||||||
|
#include "arraysumfunctor.hxx"
|
||||||
|
|
||||||
#include <formula/token.hxx>
|
#include <formula/token.hxx>
|
||||||
|
|
||||||
using namespace formula;
|
using namespace formula;
|
||||||
@@ -235,18 +237,9 @@ public:
|
|||||||
if (nDataSize == 0)
|
if (nDataSize == 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
size_t nUnrolled = (nDataSize & 0x3) >> 2;
|
sc::ArraySumFunctor functor(p, nDataSize);
|
||||||
|
|
||||||
// Try to encourage the compiler/CPU to do something sensible for the next.
|
mfRest += functor();
|
||||||
for (i = 0; i < nUnrolled; i+=4)
|
|
||||||
{
|
|
||||||
mfRest += p[i];
|
|
||||||
mfRest += p[i+1];
|
|
||||||
mfRest += p[i+2];
|
|
||||||
mfRest += p[i+3];
|
|
||||||
}
|
|
||||||
for (; i < nDataSize; ++i)
|
|
||||||
mfRest += p[i];
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user