OK, I just accepted the patch.
On Wed, Jan 22, 2014 at 6:45 PM, Yongjia Zhang <zhang_yong_...@126.com> wrote:
> From: Yongjia Zhang <zhang_yong_...@126.com>
>
> This is a better way to accomplish the box-blur cl operation by using ocl's
> local memory from the opencv source code. It use the local shared memory to
> reduce global memory access, which significantly reduces the kernel's
> processing
> time by 70 percent compared to the original one. Because of the barriers and
> local worksize limitation, processing with a radius larger than 110 becomes
> slower than original algorithm, so I keep the original kernels in order to
> deal
> with box-blur with radius larger than 110.
> All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.
>
> v2:add kernel attribute to restrict the local size to (256,1,1).
>
> Signed-off-by: Yongjia Zhang<yongjia.zh...@intel.com>
> ---
> opencl/box-blur.cl | 68 +++++++++++++++++++++++++
> opencl/box-blur.cl.h | 68 +++++++++++++++++++++++++
> operations/common/box-blur.c | 115
> ++++++++++++++++++++++++++-----------------
> 3 files changed, 205 insertions(+), 46 deletions(-)
>
> diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl
> index e99bea4..0d64c89 100644
> --- a/opencl/box-blur.cl
> +++ b/opencl/box-blur.cl
> @@ -43,3 +43,71 @@ __kernel void kernel_blur_ver (__global const float4
> *aux,
> out[out_index] = mean / (float)(2 * radius + 1);
> }
> }
> +
> +__kernel
> +__attibute__((reqd_work_group_size(256,1,1)))
> +void kernel_box_blur_fast(const __global float4 *in,
> + __global float4 *out,
> + __local float4 *column_sum,
> + const int width,
> + const int height,
> + const int radius,
> + const int size)
> +{
> + const int local_id0 = get_local_id(0);
> + const int twice_radius = 2 * radius;
> + const int in_width = twice_radius + width;
> + const int in_height = twice_radius + height;
> + const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );
> + int column_index_start,column_index_end;
> + int y = get_global_id(1) * size;
> + const int out_x = get_group_id(0)
> + * ( get_local_size(0) - twice_radius ) + local_id0 - radius;
> + const int in_x = out_x + radius;
> + int tmp_size = size;
> + int tmp_index = 0;
> + float4 tmp_sum = (float4)0.0f;
> + float4 total_sum = (float4)0.0f;
> + if( in_x < in_width )
> + {
> + column_index_start = y;
> + column_index_end = y + twice_radius;
> + for( int i=0; i<twice_radius+1; ++i )
> + tmp_sum+=in[(y+i)*in_width+in_x];
> + column_sum[local_id0] = tmp_sum;
> + }
> +
> + barrier( CLK_LOCAL_MEM_FENCE );
> +
> + while(1)
> + {
> + if( out_x < width )
> + {
> + if( local_id0 >= radius
> + && local_id0 < get_local_size(0) - radius )
> + {
> + total_sum = (float4)0.0f;
> + for( int i=0; i<twice_radius+1; ++i )
> + total_sum +=
> column_sum[local_id0-radius+i];
> + out[y*width+out_x] = total_sum/area;
> + }
> + }
> + if( --tmp_size ==0 || y == height - 1 )
> + break;
> +
> + barrier( CLK_LOCAL_MEM_FENCE );
> +
> + ++y;
> + if( in_x < in_width )
> + {
> + tmp_sum = column_sum[local_id0];
> + tmp_sum -= in[(column_index_start)*in_width+in_x];
> + tmp_sum += in[(column_index_end+1)*in_width+in_x];
> + ++column_index_start;
> + ++column_index_end;
> + column_sum[local_id0] = tmp_sum;
> + }
> +
> + barrier( CLK_LOCAL_MEM_FENCE );
> + }
> +}
> diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h
> index bfed601..e4585ec 100644
> --- a/opencl/box-blur.cl.h
> +++ b/opencl/box-blur.cl.h
> @@ -44,4 +44,72 @@ static const char* box_blur_cl_source =
> " out[out_index] = mean / (float)(2 * radius + 1);
> \n"
> " }
> \n"
> "}
> \n"
> +"
> \n"
> +"__kernel
> \n"
> +"__attribute__((reqd_work_group_size(256,1,1)))
> \n"
> +"void kernel_box_blur_fast(const __global float4 *in,
> \n"
> +" __global float4 *out,
> \n"
> +" __local float4 *column_sum,
> \n"
> +" const int width,
> \n"
> +" const int height,
> \n"
> +" const int radius,
> \n"
> +" const int size)
> \n"
> +"{
> \n"
> +" const int local_id0 = get_local_id(0);
> \n"
> +" const int twice_radius = 2 * radius;
> \n"
> +" const int in_width = twice_radius + width;
> \n"
> +" const int in_height = twice_radius + height;
> \n"
> +" const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );
> \n"
> +" int column_index_start,column_index_end;
> \n"
> +" int y = get_global_id(1) * size;
> \n"
> +" const int out_x = get_group_id(0)
> \n"
> +" * ( get_local_size(0) - twice_radius ) + local_id0 - radius;
> \n"
> +" const int in_x = out_x + radius;
> \n"
> +" int tmp_size = size;
> \n"
> +" int tmp_index = 0;
> \n"
> +" float4 tmp_sum = (float4)0.0f;
> \n"
> +" float4 total_sum = (float4)0.0f;
> \n"
> +" if( in_x < in_width )
> \n"
> +" {
> \n"
> +" column_index_start = y;
> \n"
> +" column_index_end = y + twice_radius;
> \n"
> +" for( int i=0; i<twice_radius+1; ++i )
> \n"
> +" tmp_sum+=in[(y+i)*in_width+in_x];
> \n"
> +" column_sum[local_id0] = tmp_sum;
> \n"
> +" }
> \n"
> +"
> \n"
> +" barrier( CLK_LOCAL_MEM_FENCE );
> \n"
> +"
> \n"
> +" while(1)
> \n"
> +" {
> \n"
> +" if( out_x < width )
> \n"
> +" {
> \n"
> +" if( local_id0 >= radius
> \n"
> +" && local_id0 < get_local_size(0) - radius
> ) \n"
> +" {
> \n"
> +" total_sum = (float4)0.0f;
> \n"
> +" for( int i=0; i<twice_radius+1; ++i )
> \n"
> +" total_sum += column_sum[local_id0-radius+i];
> \n"
> +" out[y*width+out_x] = total_sum/area;
> \n"
> +" }
> \n"
> +" }
> \n"
> +" if( --tmp_size ==0 || y == height - 1 )
> \n"
> +" break;
> \n"
> +"
> \n"
> +" barrier( CLK_LOCAL_MEM_FENCE );
> \n"
> +"
> \n"
> +" ++y;
> \n"
> +" if( in_x < in_width )
> \n"
> +" {
> \n"
> +" tmp_sum = column_sum[local_id0];
> \n"
> +" tmp_sum -= in[(column_index_start)*in_width+in_x];
> \n"
> +" tmp_sum += in[(column_index_end+1)*in_width+in_x];
> \n"
> +" ++column_index_start;
> \n"
> +" ++column_index_end;
> \n"
> +" column_sum[local_id0] = tmp_sum;
> \n"
> +" }
> \n"
> +"
> \n"
> +" barrier( CLK_LOCAL_MEM_FENCE );
> \n"
> +" }
> \n"
> +"}
> \n"
> ;
> diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c
> index afc19ea..cb77ec0 100644
> --- a/operations/common/box-blur.c
> +++ b/operations/common/box-blur.c
> @@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation)
> #include "buffer/gegl-buffer-cl-iterator.h"
>
> #include "opencl/box-blur.cl.h"
> -
> static GeglClRunData *cl_data = NULL;
> -
> static gboolean
> cl_box_blur (cl_mem in_tex,
> cl_mem aux_tex,
> @@ -192,57 +190,82 @@ cl_box_blur (cl_mem in_tex,
> gint radius)
> {
> cl_int cl_err = 0;
> - size_t global_ws_hor[2], global_ws_ver[2];
> - size_t local_ws_hor[2], local_ws_ver[2];
> -
> + size_t global_ws_hor[2], global_ws_ver[2], global_ws[2];
> + size_t local_ws_hor[2], local_ws_ver[2], local_ws[2];
> + size_t step_size ;
> if (!cl_data)
> {
> - const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver",
> NULL};
> + const char *kernel_name[] = { "kernel_blur_hor",
> "kernel_blur_ver","kernel_box_blur_fast", NULL};
> cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name);
> }
>
> if (!cl_data)
> return TRUE;
> -
> - local_ws_hor[0] = 1;
> - local_ws_hor[1] = 256;
> - global_ws_hor[0] = roi->height + 2 * radius;
> - global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) *
> local_ws_hor[1];
> -
> - local_ws_ver[0] = 1;
> - local_ws_ver[1] = 256;
> - global_ws_ver[0] = roi->height;
> - global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) *
> local_ws_ver[1];
> -
> -
> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
> - sizeof(cl_mem), (void*)&in_tex,
> - sizeof(cl_mem), (void*)&aux_tex,
> - sizeof(cl_int), (void*)&roi->width,
> - sizeof(cl_int), (void*)&radius,
> - NULL);
> - CL_CHECK;
> -
> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> - cl_data->kernel[0], 2,
> - NULL, global_ws_hor, local_ws_hor,
> - 0, NULL, NULL);
> - CL_CHECK;
> -
> -
> - cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
> - sizeof(cl_mem), (void*)&aux_tex,
> - sizeof(cl_mem), (void*)&out_tex,
> - sizeof(cl_int), (void*)&roi->width,
> - sizeof(cl_int), (void*)&radius,
> - NULL);
> - CL_CHECK;
> -
> - cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> - cl_data->kernel[1], 2,
> - NULL, global_ws_ver, local_ws_ver,
> - 0, NULL, NULL);
> - CL_CHECK;
> + step_size = 64;
> + local_ws[0]=256;
> + local_ws[1]=1;
> +
> +
> + if( radius <=110 )
> + {
> + global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / (
> local_ws[0] - 2 * radius ) * local_ws[0];
> + global_ws[1] = (roi->height + step_size - 1) / step_size;
> + cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2],
> + sizeof(cl_mem), (void *)&in_tex,
> + sizeof(cl_mem), (void *)&out_tex,
> + sizeof(cl_float4)*local_ws[0], (void
> *)NULL,
> + sizeof(cl_int), (void *)&roi->width,
> + sizeof(cl_int), (void *)&roi->height,
> + sizeof(cl_int), (void *)&radius,
> + sizeof(cl_int), (void *)&step_size,
> NULL);
> + CL_CHECK;
> + cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(),
> + cl_data->kernel[2], 2,
> + NULL, global_ws, local_ws, 0, NULL,
> NULL );
> + CL_CHECK;
> +
> + }
> + else
> + {
> + local_ws_hor[0] = 1;
> + local_ws_hor[1] = 256;
> + global_ws_hor[0] = roi->height + 2 * radius;
> + global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) *
> local_ws_hor[1];
> +
> + local_ws_ver[0] = 1;
> + local_ws_ver[1] = 256;
> + global_ws_ver[0] = roi->height;
> + global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) *
> local_ws_ver[1];
> +
> +
> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
> + sizeof(cl_mem), (void*)&in_tex,
> + sizeof(cl_mem), (void*)&aux_tex,
> + sizeof(cl_int), (void*)&roi->width,
> + sizeof(cl_int), (void*)&radius,
> + NULL);
> + CL_CHECK;
> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> + cl_data->kernel[0], 2,
> + NULL, global_ws_hor, local_ws_hor,
> + 0, NULL, NULL);
> + CL_CHECK;
> +
> +
> + cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
> + sizeof(cl_mem), (void*)&aux_tex,
> + sizeof(cl_mem), (void*)&out_tex,
> + sizeof(cl_int), (void*)&roi->width,
> + sizeof(cl_int), (void*)&radius,
> + NULL);
> + CL_CHECK;
> +
> + cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> + cl_data->kernel[1], 2,
> + NULL, global_ws_ver, local_ws_ver,
> + 0, NULL, NULL);
> + CL_CHECK;
> + }
>
> return FALSE;
>
> --
> 1.8.3.2
>
>
> _______________________________________________
> gegl-developer-list mailing list
> List address: gegl-developer-list@gnome.org
> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list
>
_______________________________________________
gegl-developer-list mailing list
List address: gegl-developer-list@gnome.org
List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list