sc/source/core/opencl/formulagroupcl.cxx | 70 +++++++++++++++++-------------- sc/source/core/opencl/oclkernels.hxx | 70 +++++++++++++++++++++++-------- sc/source/core/opencl/openclwrapper.cxx | 44 +++++++++++++------ 3 files changed, 121 insertions(+), 63 deletions(-)
New commits: commit 6c89aa8345e57c8c0b68b47bb67ac32b7dc930bc Author: Jing Xian <jingx...@multicorewareinc.com> Date: Thu Jul 11 11:35:54 2013 +0100 add min/max/delta kernels and misc. cleanup / bug fixing. diff --git a/sc/source/core/opencl/formulagroupcl.cxx b/sc/source/core/opencl/formulagroupcl.cxx index dee465e..8c2e236 100755 --- a/sc/source/core/opencl/formulagroupcl.cxx +++ b/sc/source/core/opencl/formulagroupcl.cxx @@ -82,16 +82,19 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& memset(rResult,0,rowSize); float * fpOclSrcData = NULL; // Point to the input data from CPU uint * npOclStartPos = NULL; // The first position for calculation,for example,the A1 in (=MAX(A1:A100)) - uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100)) - float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData) + uint * npOclEndPos = NULL; // The last position for calculation,for example, the A100 in (=MAX(A1:A100)) + float * fpLeftData = NULL; // Left input for binary operator(+,-,*,/),for example,(=leftData+rightData) float * fpRightData = NULL; // Right input for binary operator(+,-,*,/),for example,(=leftData/rightData) // The rightData can't be zero for "/" static OclCalc ocl_calc; - // Don't know how large the size will be applied previously, so create them as the rowSize or 65536 - // Don't know which formulae will be used previously, so create buffers for different formulae used probably - ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize); - ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize); - //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos); + if(ocl_calc.GetOpenclState()) + { + // Don't know how large the size will be applied previously, so create them as the rowSize or 65536 + // Don't know which formulae will be used previously, so create buffers for different formulae used probably + ocl_calc.CreateBuffer(fpOclSrcData,npOclStartPos,npOclEndPos,rowSize); + ocl_calc.CreateBuffer(fpLeftData,fpRightData,rowSize); + //printf("pptrr is %d,%d,%d\n",fpOclSrcData,npOclStartPos,npOclEndPos); + } /////////////////////////////////////////////////////////////////////////////////////////// // Until we implement group calculation for real, decompose the group into @@ -123,10 +126,11 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& nRowEnd += i; size_t nRowSize = nRowEnd - nRowStart + 1; ScMatrixRef pMat(new ScMatrix(nColSize, nRowSize, 0.0)); - - npOclStartPos[i] = nRowStart; // record the start position - npOclEndPos[i] = nRowEnd; // record the end position - + if(ocl_calc.GetOpenclState()) + { + npOclStartPos[i] = nRowStart; // record the start position + npOclEndPos[i] = nRowEnd; // record the end position + } for (size_t nCol = 0; nCol < nColSize; ++nCol) { const double* pArray = rArrays[nCol]; @@ -135,12 +139,14 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& fprintf(stderr,"Error: pArray is NULL!\n"); return false; } - - for( size_t u=0; u<rowSize; u++ ) + if(ocl_calc.GetOpenclState()) { - // Many video cards can't support double type in kernel, so need transfer the double to float - fpOclSrcData[u] = (float)pArray[u]; - //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]); + for( size_t u=nRowStart; u<=nRowEnd; u++ ) + { + // Many video cards can't support double type in kernel, so need transfer the double to float + fpOclSrcData[u] = (float)pArray[u]; + //fprintf(stderr,"fpOclSrcData[%d] is %f.\n",u,fpOclSrcData[u]); + } } for (size_t nRow = 0; nRow < nRowSize; ++nRow) @@ -165,22 +171,23 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& ScFormulaCell* pDest = rDoc.GetFormulaCell(aTmpPos); if (!pDest) return false; - - const formula::FormulaToken *pCur = aCode2.First(); - aCode2.Reset(); - while( ( pCur = aCode2.Next() ) != NULL ) + if(ocl_calc.GetOpenclState()) { - OpCode eOp = pCur->GetOpCode(); - if(eOp==0) + const formula::FormulaToken *pCur = aCode2.First(); + aCode2.Reset(); + while( ( pCur = aCode2.Next() ) != NULL ) { - if(nCount3%2==0) - fpLeftData[nCount1++] = (float)pCur->GetDouble(); - else - fpRightData[nCount2++] = (float)pCur->GetDouble(); - nCount3++; - } - else if( eOp!=ocOpen && eOp!=ocClose ) - nOclOp = eOp; + OpCode eOp = pCur->GetOpCode(); + if(eOp==0) + { + if(nCount3%2==0) + fpLeftData[nCount1++] = (float)pCur->GetDouble(); + else + fpRightData[nCount2++] = (float)pCur->GetDouble(); + nCount3++; + } + else if( eOp!=ocOpen && eOp!=ocClose ) + nOclOp = eOp; // if(count1>0){//dbg // fprintf(stderr,"leftData is %f.\n",leftData[count1-1]); @@ -190,11 +197,12 @@ bool FormulaGroupInterpreterOpenCL::interpret(ScDocument& rDoc, const ScAddress& // fprintf(stderr,"rightData is %f.\n",rightData[count2-1]); // count2--; // } + } } if(!getenv("SC_GPU")||!ocl_calc.GetOpenclState()) { - fprintf(stderr,"ccCPU flow...\n\n"); + //fprintf(stderr,"ccCPU flow...\n\n"); generateRPNCode(rDoc, aTmpPos, aCode2); ScInterpreter aInterpreter(pDest, &rDoc, aTmpPos, aCode2); aInterpreter.Interpret(); diff --git a/sc/source/core/opencl/oclkernels.hxx b/sc/source/core/opencl/oclkernels.hxx index e13c24a..c231dbd 100755 --- a/sc/source/core/opencl/oclkernels.hxx +++ b/sc/source/core/opencl/oclkernels.hxx @@ -162,32 +162,66 @@ __kernel void oclFormulaMinverse(__global float *data, } // Double precision is a requirement of spreadsheets -#if 0 -#if defined(cl_khr_fp64) // Khronos extension -#pragma OPENCL EXTENSION cl_khr_fp64 : enable -#elif defined(cl_amd_fp64) // AMD extension -#pragma OPENCL EXTENSION cl_amd_fp64 : enable -#endif -typedef double fp_t; -#else -typedef float fp_t; -#endif - -__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, __global int start, __global int end, __global fp_t *output) +// cl_khr_fp64: Khronos extension +// cl_amd_fp64: AMD extension +\n#if 0 \n +\n#if defined(cl_khr_fp64) \n +\n#pragma OPENCL EXTENSION cl_khr_fp64 : enable \n +\n#elif defined(cl_amd_fp64) \n +\n#pragma OPENCL EXTENSION cl_amd_fp64 : enable \n +\n#endif \n +\ntypedef double fp_t; \n +\n#else \n +\ntypedef float fp_t; \n +\n#endif \n + +__kernel void oclAverageDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output) { const unsigned int id = get_global_id(0); // Average - int i; - fp_t sum = 0.0; - for(i = start; i < end; i++) - sum += values[i]; - fp_t val = sum/(end-start); + fp_t fSum = 0.0f; + for(int i = start; i < end; i++) + fSum += values[i]; + fp_t fVal = fSum/(end-start); // Subtract & output - output[id] = val - subtract[id]; + output[id] = fVal - subtract[id]; } +__kernel void oclMaxDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + + // Max + float fMaxVal = values[start]; + for(int i=start+1;i < end;i++) + { + if(values[i]>fMaxVal) + fMaxVal = values[i]; + } + + // Subtract & output + output[id] = fMaxVal - subtract[id]; +} + +__kernel void oclMinDelta(__global fp_t *values, __global fp_t *subtract, uint start, uint end, __global fp_t *output) +{ + const unsigned int id = get_global_id(0); + + // Min + float fMinVal = values[start]; + for(int i=start+1;i < end;i++) + { + if(values[i]<fMinVal) + fMinVal = values[i]; + } + + // Subtract & output + output[id] = fMinVal - subtract[id]; +} + + ); #endif // USE_EXTERNAL_KERNEL diff --git a/sc/source/core/opencl/openclwrapper.cxx b/sc/source/core/opencl/openclwrapper.cxx index 6132ae3..781c838 100755 --- a/sc/source/core/opencl/openclwrapper.cxx +++ b/sc/source/core/opencl/openclwrapper.cxx @@ -106,6 +106,8 @@ int OpenclDevice::RegistOpenclKernel() AddKernelConfig(11, (const char*) "oclSignedMul"); AddKernelConfig(12, (const char*) "oclSignedDiv"); AddKernelConfig(13, (const char*) "oclAverageDelta"); + AddKernelConfig(14, (const char*) "OclMaxDelta"); + AddKernelConfig(15, (const char*) "OclMinDelta"); return 0; } @@ -499,18 +501,17 @@ int OpenclDevice::CompileKernelFile(GPUEnv *gpuInfo, const char *buildOption) { //char options[512]; // create a cl program executable for all the devices specified + printf("BuildProgram.\n"); if (!gpuInfo->mnIsUserCreated) { status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, buildOption, NULL, NULL); - CHECK_OPENCL(status) } else { status = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), buildOption, NULL, NULL); - CHECK_OPENCL(status) } - printf("BuildProgram.\n"); if (status != CL_SUCCESS) { + printf ("BuildProgram error!\n"); if (!gpuInfo->mnIsUserCreated) { status = clGetProgramBuildInfo(gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, @@ -678,7 +679,7 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo) &numDevices); if (status != CL_SUCCESS) { - return 1; + continue; } if (numDevices) { @@ -686,6 +687,8 @@ int OpenclDevice::InitOpenclRunEnv(GPUEnv *gpuInfo) } } } + if(status!=CL_SUCCESS) + return 1; free(platforms); } if (NULL == gpuInfo->mpPlatformID) { @@ -2102,7 +2105,7 @@ static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues, nElements * sizeof(double), NULL, pStatus); fp_t *pValues = (fp_t *)clEnqueueMapBuffer(rEnv.mpkCmdQueue,xValues,CL_TRUE,CL_MAP_WRITE,0, nElements * sizeof(fp_t),0,NULL,NULL,NULL); - for(int i=0;i<nElements;i++) + for(int i=0;i<(int)nElements;i++) pValues[i] = (fp_t)_pValues[i]; clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); @@ -2113,15 +2116,23 @@ static cl_mem allocateDoubleBuffer(KernelEnv &rEnv, const double *_pValues, double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, const double *pSubtractSingle, size_t nElements) { - KernelEnv kEnv; SetKernelEnv(&kEnv); // select a kernel: cut & paste coding is utterly evil. - const char *kernelName; + const char *kernelName = NULL; switch (eOp) { + case ocAdd: + case ocSub: + fprintf(stderr,"ocSub is %d\n",ocSub); + case ocMul: + case ocDiv: + ; // FIXME: fallthrough for now case ocMax: + kernelName = "oclMaxDelta"; + break; case ocMin: - ; // FIXME: fallthrough for now + kernelName = "oclMinDelta"; + break; case ocAverage: kernelName = "oclAverageDelta"; break; @@ -2133,15 +2144,20 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, cl_int clStatus; size_t global_work_size[1]; - kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram,kernelName, &clStatus); + kEnv.mpkKernel = clCreateKernel(kEnv.mpkProgram, kernelName, &clStatus); CHECK_OPENCL(clStatus); + if (!kEnv.mpkKernel) + { + fprintf(stderr, "could not clCreateKernel '%s'\n", kernelName); + return NULL; + } // Ugh - horrible redundant copying ... cl_mem valuesCl = allocateDoubleBuffer(kEnv, pOpArray, nElements, &clStatus); cl_mem subtractCl = allocateDoubleBuffer(kEnv, pSubtractSingle, nElements, &clStatus); - cl_int start = 0; - cl_int end = (cl_int) nElements; + cl_uint start = 0; + cl_uint end = (cl_uint) nElements; cl_mem outputCl = clCreateBuffer(kEnv.mpkContext, CL_MEM_READ_WRITE, nElements * sizeof(fp_t), @@ -2155,10 +2171,10 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, clStatus = clSetKernelArg(kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&subtractCl); CHECK_OPENCL(clStatus); - clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_int), + clStatus = clSetKernelArg(kEnv.mpkKernel, 2, sizeof(cl_uint), (void *)&start); CHECK_OPENCL(clStatus); - clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_int), + clStatus = clSetKernelArg(kEnv.mpkKernel, 3, sizeof(cl_uint), (void *)&end); CHECK_OPENCL(clStatus); clStatus = clSetKernelArg(kEnv.mpkKernel, 4, sizeof(cl_mem), @@ -2179,7 +2195,7 @@ double *OclCalc::OclSimpleDeltaOperation(OpCode eOp, const double *pOpArray, fp_t *pOutput = (fp_t *)clEnqueueMapBuffer(kEnv.mpkCmdQueue,outputCl,CL_TRUE, CL_MAP_READ,0,nElements*sizeof(fp_t), 0,NULL,NULL,NULL); - for(int i = 0; i < nElements; i++) + for(int i = 0; i < (int)nElements; i++) pResult[i] = (double)pOutput[i]; clEnqueueUnmapMemObject(kEnv.mpkCmdQueue,outputCl,pOutput,0,NULL,NULL); _______________________________________________ Libreoffice-commits mailing list libreoffice-comm...@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/libreoffice-commits