On Wed, Jan 22, 2014 at 04:21:42PM +0800, Yongjia Zhang wrote:
> From: Yongjia Zhang <zhang_yong_...@126.com>
> 
> This is a better way to accomplish the box-blur cl operation by using ocl's
> local memory from the opencv source code. It use the local shared memory to
> reduce global memory access, which significantly reduces the kernel's 
> processing
> time by 70 percent compared to the original one. Because of the barriers and
> local worksize limitation, processing with a radius larger than 110 becomes
> slower than original algorithm, so I keep the original kernels in order to 
> deal
> with box-blur with radius larger than 110.
> All the tests are based on Intel Beginet and Intel IvyBridge CPU and GPU.
> 
> Signed-off-by: Yongjia Zhang <yongjia.zh...@intel.com>
> ---
>  opencl/box-blur.cl           |  66 +++++++++++++++++++++++++
>  opencl/box-blur.cl.h         |  66 +++++++++++++++++++++++++
>  operations/common/box-blur.c | 115 
> ++++++++++++++++++++++++++-----------------
>  3 files changed, 201 insertions(+), 46 deletions(-)
> 
> diff --git a/opencl/box-blur.cl b/opencl/box-blur.cl
> index e99bea4..a1da9de 100644
> --- a/opencl/box-blur.cl
> +++ b/opencl/box-blur.cl
> @@ -43,3 +43,69 @@ __kernel void kernel_blur_ver (__global const float4     
> *aux,
>        out[out_index] = mean / (float)(2 * radius + 1);
>      }
>  }
> +
> +__kernel void kernel_box_blur_fast(const __global float4 *in,                
>        
> +                                   __global float4 *out,                     
>       
> +                                   __local float4 *column_sum,               
>        
> +                                   const int width,                         
> +                                   const int height,                        
> +                                   const int radius,                        
> +                                   const int size)                          
> +{                                                                           
> +     const int local_id0 = get_local_id(0);                                 
> +     const int twice_radius = 2 * radius;                                   
> +     const int in_width = twice_radius + width;                             
> +     const int in_height = twice_radius + height;                           
> +     const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );   
> +     int column_index_start,column_index_end;                               
> +     int y = get_global_id(1) * size;                                       
> +     const int out_x = get_group_id(0)                                      
> +               * ( get_local_size(0) - twice_radius ) + local_id0 - radius;  
>  
> +     const int in_x = out_x + radius;                                       
> +     int tmp_size = size;                                                   
> +     int tmp_index = 0;                                                     
> +     float4 tmp_sum = (float4)0.0f;                                         
> +     float4 total_sum = (float4)0.0f;                                       
> +     if( in_x < in_width )                                                  
> +     {                                                                      
> +             column_index_start = y;                                        
> +             column_index_end = y + twice_radius;                           
> +             for( int i=0; i<twice_radius+1; ++i )                          
> +                     tmp_sum+=in[(y+i)*in_width+in_x];                      
> +             column_sum[local_id0] = tmp_sum;                               
> +     }
> +
> +     barrier( CLK_LOCAL_MEM_FENCE );                                        
> +                                                                            
> +     while(1)                                                               
> +     {                                                                      
> +             if( out_x < width )                                            
> +             {                                                              
> +                     if( local_id0 >= radius                                
> +                                 && local_id0 < get_local_size(0) - radius ) 
>  
> +                     {                                                     
> +                             total_sum = (float4)0.0f;                      
> +                             for( int i=0; i<twice_radius+1; ++i )          
> +                                total_sum += column_sum[local_id0-radius+i];
> +                             out[y*width+out_x] = total_sum/area;           
> +                     }                                                      
> +             }                                                              
> +             if( --tmp_size ==0 || y == height - 1 )                        
> +                     break;
> +
> +             barrier( CLK_LOCAL_MEM_FENCE );

Is this barrier call guaranteed to be executed by all threads?  If not,
then this will produce undefined behavior.

-Tom

> +
> +             ++y;                                                           
> +             if( in_x < in_width )                                          
> +             {                                                              
> +                     tmp_sum = column_sum[local_id0];                       
> +                     tmp_sum -= in[(column_index_start)*in_width+in_x];     
> +                     tmp_sum += in[(column_index_end+1)*in_width+in_x];     
> +                     ++column_index_start;                                  
> +                     ++column_index_end;                                    
> +                     column_sum[local_id0] = tmp_sum;                       
> +             }
> +
> +             barrier( CLK_LOCAL_MEM_FENCE );                                
> +     }                                                                      
> +}                                                                           
> diff --git a/opencl/box-blur.cl.h b/opencl/box-blur.cl.h
> index bfed601..8f6aa81 100644
> --- a/opencl/box-blur.cl.h
> +++ b/opencl/box-blur.cl.h
> @@ -44,4 +44,70 @@ static const char* box_blur_cl_source =
>  "      out[out_index] = mean / (float)(2 * radius + 1);                      
>   \n"
>  "    }                                                                       
>   \n"
>  "}                                                                           
>   \n"
> +"                                                                            
>   \n"
> +"__kernel void kernel_box_blur_fast(const __global float4 *in,               
>   \n"
> +"                                   __global float4 *out,                    
>   \n"
> +"                                   __local float4 *column_sum,              
>   \n"
> +"                                   const int width,                         
>   \n"
> +"                                   const int height,                        
>   \n"
> +"                                   const int radius,                        
>   \n"
> +"                                   const int size)                          
>   \n"
> +"{                                                                           
>   \n"
> +"    const int local_id0 = get_local_id(0);                                  
>   \n"
> +"    const int twice_radius = 2 * radius;                                    
>   \n"
> +"    const int in_width = twice_radius + width;                              
>   \n"
> +"    const int in_height = twice_radius + height;                            
>   \n"
> +"    const float4 area = (float4)( (twice_radius+1) * (twice_radius+1) );    
>   \n"
> +"    int column_index_start,column_index_end;                                
>   \n"
> +"    int y = get_global_id(1) * size;                                        
>   \n"
> +"    const int out_x = get_group_id(0)                                       
>   \n"
> +"               * ( get_local_size(0) - twice_radius ) + local_id0 - radius; 
>   \n"
> +"    const int in_x = out_x + radius;                                        
>   \n"
> +"    int tmp_size = size;                                                    
>   \n"
> +"    int tmp_index = 0;                                                      
>   \n"
> +"    float4 tmp_sum = (float4)0.0f;                                          
>   \n"
> +"    float4 total_sum = (float4)0.0f;                                        
>   \n"
> +"    if( in_x < in_width )                                                   
>   \n"
> +"    {                                                                       
>   \n"
> +"        column_index_start = y;                                             
>   \n"
> +"        column_index_end = y + twice_radius;                                
>   \n"
> +"        for( int i=0; i<twice_radius+1; ++i )                               
>   \n"
> +"            tmp_sum+=in[(y+i)*in_width+in_x];                               
>   \n"
> +"        column_sum[local_id0] = tmp_sum;                                    
>   \n"
> +"    }                                                                       
>   \n"
> +"                                                                            
>   \n"
> +"    barrier( CLK_LOCAL_MEM_FENCE );                                         
>   \n"
> +"                                                                            
>   \n"
> +"    while(1)                                                                
>   \n"
> +"    {                                                                       
>   \n"
> +"        if( out_x < width )                                                 
>   \n"
> +"        {                                                                   
>   \n"
> +"            if( local_id0 >= radius                                         
>   \n"
> +"                                 && local_id0 < get_local_size(0) - radius 
> )  \n"
> +"            {                                                               
>   \n"
> +"                total_sum = (float4)0.0f;                                   
>   \n"
> +"                for( int i=0; i<twice_radius+1; ++i )                       
>   \n"
> +"                   total_sum += column_sum[local_id0-radius+i];             
>   \n"
> +"                out[y*width+out_x] = total_sum/area;                        
>   \n"
> +"            }                                                               
>   \n"
> +"        }                                                                   
>   \n"
> +"        if( --tmp_size ==0 || y == height - 1 )                             
>   \n"
> +"            break;                                                          
>   \n"
> +"                                                                            
>   \n"
> +"        barrier( CLK_LOCAL_MEM_FENCE );                                     
>   \n"
> +"                                                                            
>   \n"
> +"        ++y;                                                                
>   \n"
> +"        if( in_x < in_width )                                               
>   \n"
> +"        {                                                                   
>   \n"
> +"            tmp_sum = column_sum[local_id0];                                
>   \n"
> +"            tmp_sum -= in[(column_index_start)*in_width+in_x];              
>   \n"
> +"            tmp_sum += in[(column_index_end+1)*in_width+in_x];              
>   \n"
> +"            ++column_index_start;                                           
>   \n"
> +"            ++column_index_end;                                             
>   \n"
> +"            column_sum[local_id0] = tmp_sum;                                
>   \n"
> +"        }                                                                   
>   \n"
> +"                                                                            
>   \n"
> +"        barrier( CLK_LOCAL_MEM_FENCE );                                     
>   \n"
> +"    }                                                                       
>   \n"
> +"}                                                                           
>   \n"
>  ;
> diff --git a/operations/common/box-blur.c b/operations/common/box-blur.c
> index afc19ea..cb77ec0 100644
> --- a/operations/common/box-blur.c
> +++ b/operations/common/box-blur.c
> @@ -180,9 +180,7 @@ static void prepare (GeglOperation *operation)
>  #include "buffer/gegl-buffer-cl-iterator.h"
>  
>  #include "opencl/box-blur.cl.h"
> -
>  static GeglClRunData *cl_data = NULL;
> -
>  static gboolean
>  cl_box_blur (cl_mem                in_tex,
>               cl_mem                aux_tex,
> @@ -192,57 +190,82 @@ cl_box_blur (cl_mem                in_tex,
>               gint                  radius)
>  {
>    cl_int cl_err = 0;
> -  size_t global_ws_hor[2], global_ws_ver[2];
> -  size_t local_ws_hor[2], local_ws_ver[2];
> -
> +  size_t global_ws_hor[2], global_ws_ver[2], global_ws[2];
> +  size_t local_ws_hor[2], local_ws_ver[2], local_ws[2];
> +  size_t step_size ;
>    if (!cl_data)
>      {
> -      const char *kernel_name[] = {"kernel_blur_hor", "kernel_blur_ver", 
> NULL};
> +      const char *kernel_name[] = { "kernel_blur_hor", 
> "kernel_blur_ver","kernel_box_blur_fast", NULL};
>        cl_data = gegl_cl_compile_and_build (box_blur_cl_source, kernel_name);
>      }
>  
>    if (!cl_data)
>      return TRUE;
> -
> -  local_ws_hor[0] = 1;
> -  local_ws_hor[1] = 256;
> -  global_ws_hor[0] = roi->height + 2 * radius;
> -  global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * 
> local_ws_hor[1];
> -
> -  local_ws_ver[0] = 1;
> -  local_ws_ver[1] = 256;
> -  global_ws_ver[0] = roi->height;
> -  global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * 
> local_ws_ver[1];
> -
> -
> -  cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
> -                                    sizeof(cl_mem), (void*)&in_tex,
> -                                    sizeof(cl_mem), (void*)&aux_tex,
> -                                    sizeof(cl_int), (void*)&roi->width,
> -                                    sizeof(cl_int), (void*)&radius,
> -                                    NULL);
> -  CL_CHECK;
> -
> -  cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> -                                        cl_data->kernel[0], 2,
> -                                        NULL, global_ws_hor, local_ws_hor,
> -                                        0, NULL, NULL);
> -  CL_CHECK;
> -
> -
> -  cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
> -                                    sizeof(cl_mem), (void*)&aux_tex,
> -                                    sizeof(cl_mem), (void*)&out_tex,
> -                                    sizeof(cl_int), (void*)&roi->width,
> -                                    sizeof(cl_int), (void*)&radius,
> -                                    NULL);
> -  CL_CHECK;
> -
> -  cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> -                                        cl_data->kernel[1], 2,
> -                                        NULL, global_ws_ver, local_ws_ver,
> -                                        0, NULL, NULL);
> -  CL_CHECK;
> +  step_size = 64;
> +  local_ws[0]=256;
> +  local_ws[1]=1;
> +
> +
> +  if( radius <=110 )
> +  {
> +    global_ws[0] = (roi->width + local_ws[0] - 2 * radius - 1) / ( 
> local_ws[0] - 2 * radius ) * local_ws[0];
> +    global_ws[1] = (roi->height + step_size - 1) / step_size;
> +    cl_err = gegl_cl_set_kernel_args(cl_data->kernel[2],
> +                                     sizeof(cl_mem), (void *)&in_tex,
> +                                     sizeof(cl_mem), (void *)&out_tex,
> +                                     sizeof(cl_float4)*local_ws[0], (void 
> *)NULL,
> +                                     sizeof(cl_int), (void *)&roi->width,
> +                                     sizeof(cl_int), (void *)&roi->height,
> +                                     sizeof(cl_int), (void *)&radius,
> +                                     sizeof(cl_int), (void *)&step_size, 
> NULL);
> +    CL_CHECK;
> +    cl_err = gegl_clEnqueueNDRangeKernel(gegl_cl_get_command_queue(),
> +                                         cl_data->kernel[2], 2,
> +                                         NULL, global_ws, local_ws, 0, NULL, 
> NULL );
> +     CL_CHECK;
> +
> +  }
> +  else
> +  {
> +    local_ws_hor[0] = 1;
> +    local_ws_hor[1] = 256;
> +    global_ws_hor[0] = roi->height + 2 * radius;
> +    global_ws_hor[1] = ((roi->width + local_ws_hor[1] -1)/local_ws_hor[1]) * 
> local_ws_hor[1];
> +
> +    local_ws_ver[0] = 1;
> +    local_ws_ver[1] = 256;
> +    global_ws_ver[0] = roi->height;
> +    global_ws_ver[1] = ((roi->width + local_ws_ver[1] -1)/local_ws_ver[1]) * 
> local_ws_ver[1];
> +
> +
> +    cl_err = gegl_cl_set_kernel_args (cl_data->kernel[0],
> +                                      sizeof(cl_mem), (void*)&in_tex,
> +                                      sizeof(cl_mem), (void*)&aux_tex,
> +                                      sizeof(cl_int), (void*)&roi->width,
> +                                      sizeof(cl_int), (void*)&radius,
> +                                      NULL);
> +    CL_CHECK;
> +    cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> +                                          cl_data->kernel[0], 2,
> +                                          NULL, global_ws_hor, local_ws_hor,
> +                                          0, NULL, NULL);
> +    CL_CHECK;
> +
> +
> +    cl_err = gegl_cl_set_kernel_args (cl_data->kernel[1],
> +                                      sizeof(cl_mem), (void*)&aux_tex,
> +                                      sizeof(cl_mem), (void*)&out_tex,
> +                                      sizeof(cl_int), (void*)&roi->width,
> +                                      sizeof(cl_int), (void*)&radius,
> +                                      NULL);
> +    CL_CHECK;
> +
> +    cl_err = gegl_clEnqueueNDRangeKernel (gegl_cl_get_command_queue (),
> +                                          cl_data->kernel[1], 2,
> +                                          NULL, global_ws_ver, local_ws_ver,
> +                                          0, NULL, NULL);
> +    CL_CHECK;
> +  }
>  
>    return FALSE;
>  
> -- 
> 1.8.3.2
> 
> 
> _______________________________________________
> gegl-developer-list mailing list
> List address:    gegl-developer-list@gnome.org
> List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list
> 
_______________________________________________
gegl-developer-list mailing list
List address:    gegl-developer-list@gnome.org
List membership: https://mail.gnome.org/mailman/listinfo/gegl-developer-list

Reply via email to