2014-12-18 8:14 GMT+08:00 Titov, Alexey <alexey.ti...@amd.com>: > > Hi, here is a patch for optimized OpenCL implementation for > libavfilter/unsharpen filter. > This implementation leverages hardware acceleration where possible. > > Regards, > Alexey > > --- > libavfilter/unsharp.h | 4 ++ > libavfilter/unsharp_opencl.c | 76 +++++++++++++++------- > libavfilter/unsharp_opencl_kernel.h | 122 > ++++++++++++++++++++++++++---------- > libavutil/opencl.c | 19 +++++- > 4 files changed, 165 insertions(+), 56 deletions(-) > > diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h > index c2aed64..fc651c0 100644 > --- a/libavfilter/unsharp.h > +++ b/libavfilter/unsharp.h > @@ -41,6 +41,10 @@ typedef struct { > cl_kernel kernel_chroma; > cl_mem cl_luma_mask; > cl_mem cl_chroma_mask; > + cl_mem cl_luma_mask_x; > + cl_mem cl_chroma_mask_x; > + cl_mem cl_luma_mask_y; > + cl_mem cl_chroma_mask_y; > int in_plane_size[8]; > int out_plane_size[8]; > int plane_num; > diff --git a/libavfilter/unsharp_opencl.c b/libavfilter/unsharp_opencl.c > index 5c6b5ef..a99fc5b 100644 > --- a/libavfilter/unsharp_opencl.c > +++ b/libavfilter/unsharp_opencl.c > @@ -87,42 +87,36 @@ end: > return ret; > } > -static int compute_mask_matrix(cl_mem cl_mask_matrix, int step_x, int > step_y) > +static int copy_separable_masks(cl_mem cl_mask_x, cl_mem cl_mask_y, int > step_x, int step_y) > { > - int i, j, ret = 0; > - uint32_t *mask_matrix, *mask_x, *mask_y; > - size_t size_matrix = sizeof(uint32_t) * (2 * step_x + 1) * (2 * > step_y + 1); > - mask_x = av_mallocz_array(2 * step_x + 1, sizeof(uint32_t)); > + int ret = 0; > + uint32_t *mask_x, *mask_y; > + size_t size_mask_x = sizeof(uint32_t) * (2 * step_x + 1); > + size_t size_mask_y = sizeof(uint32_t) * (2 * step_y + 1); > + mask_x = av_mallocz_array(size_mask_x); > if (!mask_x) { > ret = AVERROR(ENOMEM); > goto end; > } > - mask_y = av_mallocz_array(2 * step_y + 1, sizeof(uint32_t)); > + mask_y = av_mallocz_array(size_mask_y); > if (!mask_y) { > ret = AVERROR(ENOMEM); > goto end; > } > - mask_matrix = av_mallocz(size_matrix); > - if (!mask_matrix) { > - ret = AVERROR(ENOMEM); > - goto end; > - } > + > ret = compute_mask(step_x, mask_x); > if (ret < 0) > goto end; > ret = compute_mask(step_y, mask_y); > if (ret < 0) > goto end; > - for (j = 0; j < 2 * step_y + 1; j++) { > - for (i = 0; i < 2 * step_x + 1; i++) { > - mask_matrix[i + j * (2 * step_x + 1)] = mask_y[j] * mask_x[i]; > - } > - } > - ret = av_opencl_buffer_write(cl_mask_matrix, (uint8_t *)mask_matrix, > size_matrix); > + > + ret = av_opencl_buffer_write(cl_mask_x, (uint8_t *)mask_x, > size_mask_x); > + ret = av_opencl_buffer_write(cl_mask_y, (uint8_t *)mask_y, > size_mask_y); > end: > av_freep(&mask_x); > av_freep(&mask_y); > - av_freep(&mask_matrix); > + > return ret; > } > @@ -133,6 +127,11 @@ static int generate_mask(AVFilterContext *ctx) > cl_mem mask_matrix[2]; > mask_matrix[0] = unsharp->opencl_ctx.cl_luma_mask; > mask_matrix[1] = unsharp->opencl_ctx.cl_chroma_mask; > + cl_mem masks[4]; > + masks[0] = unsharp->opencl_ctx.cl_luma_mask_x; > + masks[1] = unsharp->opencl_ctx.cl_luma_mask_y; > + masks[2] = unsharp->opencl_ctx.cl_chroma_mask_x; > + masks[3] = unsharp->opencl_ctx.cl_chroma_mask_y; > step_x[0] = unsharp->luma.steps_x; > step_x[1] = unsharp->chroma.steps_x; > step_y[0] = unsharp->luma.steps_y; > @@ -144,12 +143,16 @@ static int generate_mask(AVFilterContext *ctx) > else > unsharp->opencl_ctx.use_fast_kernels = 1; > + if (!masks[0] || !masks[1] || !masks[2] || !masks[3]) { > + av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not > be NULL\n"); > + return AVERROR(EINVAL); > + } > if (!mask_matrix[0] || !mask_matrix[1]) { > av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not > be NULL\n"); > return AVERROR(EINVAL); > } > for (i = 0; i < 2; i++) { > - ret = compute_mask_matrix(mask_matrix[i], step_x[i], step_y[i]); > + ret = copy_separable_masks(masks[2*i], masks[2*i+1], step_x[i], > step_y[i]); > if (ret < 0) > return ret; > } > @@ -184,7 +187,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > ret = avpriv_opencl_set_parameter(&kernel1, > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), > - > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_x), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_y), > > FF_OPENCL_PARAM_INFO(unsharp->luma.amount), > > FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits), > > FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale), > @@ -201,7 +205,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > ret = avpriv_opencl_set_parameter(&kernel2, > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), > - > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_x), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_y), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.amount), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale), > @@ -264,7 +269,9 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > return AVERROR_EXTERNAL; > } > } > - clFinish(unsharp->opencl_ctx.command_queue); > + //blocking map is suffficient, no need for clFinish > + //clFinish(unsharp->opencl_ctx.command_queue); > + > return av_opencl_buffer_read_image(out->data, > unsharp->opencl_ctx.out_plane_size, > unsharp->opencl_ctx.plane_num, > unsharp->opencl_ctx.cl_outbuf, > > unsharp->opencl_ctx.cl_outbuf_size); > @@ -286,6 +293,27 @@ int ff_opencl_unsharp_init(AVFilterContext *ctx) > ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask, > sizeof(uint32_t) * (2 * > unsharp->chroma.steps_x + 1) * (2 * unsharp->chroma.steps_y + 1), > CL_MEM_READ_ONLY, NULL); > + // separable filters > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_x, > + sizeof(uint32_t) * (2 * > unsharp->luma.steps_x + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_y, > + sizeof(uint32_t) * (2 * > unsharp->luma.steps_y + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_x, > + sizeof(uint32_t) * (2 * > unsharp->chroma.steps_x + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_y, > + sizeof(uint32_t) * (2 * > unsharp->chroma.steps_y + 1), > + CL_MEM_READ_ONLY, NULL); > if (ret < 0) > return ret; > ret = generate_mask(ctx); > @@ -339,6 +367,10 @@ void ff_opencl_unsharp_uninit(AVFilterContext *ctx) > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_outbuf); > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask); > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_x); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_x); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_y); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_y); > clReleaseKernel(unsharp->opencl_ctx.kernel_default); > clReleaseKernel(unsharp->opencl_ctx.kernel_luma); > clReleaseKernel(unsharp->opencl_ctx.kernel_chroma); > diff --git a/libavfilter/unsharp_opencl_kernel.h > b/libavfilter/unsharp_opencl_kernel.h > index 9c4fd65..e013e2f 100644 > --- a/libavfilter/unsharp_opencl_kernel.h > +++ b/libavfilter/unsharp_opencl_kernel.h > @@ -36,7 +36,8 @@ inline unsigned char clip_uint8(int a) > kernel void unsharp_luma( > global unsigned char *src, > global unsigned char *dst, > - global int *mask, > + global int *mask_x, > + global int *mask_y, > int amount, > int scalebits, > int halfscale, > @@ -59,10 +60,12 @@ kernel void unsharp_luma( > return; > } > - local uchar l[32][32]; > - local int lc[LU_RADIUS_X*LU_RADIUS_Y]; > + local short l[32][32]; > + local int lcx[LU_RADIUS_X]; > + local int lcy[LU_RADIUS_Y]; > int indexIx, indexIy, i, j; > + //load up tile: actual workspace + halo of 8 points in x and y \n > for(i = 0; i <= 1; i++) { > indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; > indexIy = indexIy < 0 ? 0 : indexIy; > @@ -76,27 +79,54 @@ kernel void unsharp_luma( > } > int indexL = threadIdx.y*16 + threadIdx.x; > - if (indexL < LU_RADIUS_X*LU_RADIUS_Y) > - lc[indexL] = mask[indexL]; > + if (indexL < LU_RADIUS_X) > + lcx[indexL] = mask_x[indexL]; > + if (indexL < LU_RADIUS_Y) > + lcy[indexL] = mask_y[indexL]; > barrier(CLK_LOCAL_MEM_FENCE); > + //needed for unsharp mask application in the end \n > + int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > + > int idx, idy, maskIndex; > - int sum = 0; > - int steps_x = LU_RADIUS_X/2; > - int steps_y = LU_RADIUS_Y/2; > + int temp[2] = {0,0}; > + int steps_x = (LU_RADIUS_X-1)/2; > + int steps_y = (LU_RADIUS_Y-1)/2; > - \n#pragma unroll\n > - for (i = -steps_y; i <= steps_y; i++) { > - idy = 8 + i + threadIdx.y; > - \n#pragma unroll\n > - for (j = -steps_x; j <= steps_x; j++) { > - idx = 8 + j + threadIdx.x; > - maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x; > - sum += (int)l[idy][idx] * lc[maskIndex]; > + // compute the actual workspace + left&right halos \n > + \n#pragma unroll\n > + for (j = 0; j <=1; j++) { > + //extra work to cover left and right halos \n > + idx = 16*j + threadIdx.x; > + \n#pragma unroll\n > + for (i = -steps_y; i <= steps_y; i++) { > + idy = 8 + i + threadIdx.y; > + maskIndex = (i + steps_y); > + temp[j] += (int)l[idy][idx] * lcy[maskIndex]; > } > } > - int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > - int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * > amount) >> 16); > + barrier(CLK_LOCAL_MEM_FENCE); > + //save results from the vertical filter in local memory \n > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = 0; j <=1; j++) { > + idx = 16*j + threadIdx.x; > + l[idy][idx] = temp[j]; > + } > + barrier(CLK_LOCAL_MEM_FENCE); > + > + //compute results with the horizontal filter \n > + int sum = 0; > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = -steps_x; j <= steps_x; j++) { > + idx = 8 + j + threadIdx.x; > + maskIndex = j + steps_x; > + sum += (int)l[idy][idx] * lcx[maskIndex]; > + } > + > + int res = orig_value + (((orig_value - (int)((sum + halfscale) >> > scalebits)) * amount) >> 16); > + > if (globalIdx.x < width && globalIdx.y < height) > dst[globalIdx.x + globalIdx.y*dst_stride] = clip_uint8(res); > } > @@ -104,7 +134,8 @@ kernel void unsharp_luma( > kernel void unsharp_chroma( > global unsigned char *src_y, > global unsigned char *dst_y, > - global int *mask, > + global int *mask_x, > + global int *mask_y, > int amount, > int scalebits, > int halfscale, > @@ -141,8 +172,9 @@ kernel void unsharp_chroma( > return; > } > - local uchar l[32][32]; > - local int lc[CH_RADIUS_X*CH_RADIUS_Y]; > + local ushort l[32][32]; > + local int lcx[CH_RADIUS_X]; > + local int lcy[CH_RADIUS_Y]; > int indexIx, indexIy, i, j; > for(i = 0; i <= 1; i++) { > indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; > @@ -157,27 +189,51 @@ kernel void unsharp_chroma( > } > int indexL = threadIdx.y*16 + threadIdx.x; > - if (indexL < CH_RADIUS_X*CH_RADIUS_Y) > - lc[indexL] = mask[indexL]; > + if (indexL < CH_RADIUS_X) > + lcx[indexL] = mask_x[indexL]; > + if (indexL < CH_RADIUS_Y) > + lcy[indexL] = mask_y[indexL]; > barrier(CLK_LOCAL_MEM_FENCE); > + int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > + > int idx, idy, maskIndex; > - int sum = 0; > int steps_x = CH_RADIUS_X/2; > int steps_y = CH_RADIUS_Y/2; > + int temp[2] = {0,0}; > \n#pragma unroll\n > - for (i = -steps_y; i <= steps_y; i++) { > - idy = 8 + i + threadIdx.y; > + for (j = 0; j <= 1; j++) { > + idx = 16*j + threadIdx.x; > \n#pragma unroll\n > - for (j = -steps_x; j <= steps_x; j++) { > - idx = 8 + j + threadIdx.x; > - maskIndex = (i + steps_y)*CH_RADIUS_X + j + steps_x; > - sum += (int)l[idy][idx] * lc[maskIndex]; > - } > + for (i = -steps_y; i <= steps_y; i++) { > + idy = 8 + i + threadIdx.y; > + maskIndex = i + steps_y; > + temp[j] += (int)l[idy][idx] * lcy[maskIndex]; > + } > + } > + > + barrier(CLK_LOCAL_MEM_FENCE); > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = 0; j <= 1; j++) { > + idx = 16*j + threadIdx.x; > + l[idy][idx] = temp[j]; > } > - int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > - int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * > amount) >> 16); > + barrier(CLK_LOCAL_MEM_FENCE); > + > + //compute results with the horizontal filter \n > + int sum = 0; > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = -steps_x; j <= steps_x; j++) { > + idx = 8 + j + threadIdx.x; > + maskIndex = j + steps_x; > + sum += (int)l[idy][idx] * lcx[maskIndex]; > + } > + > + int res = orig_value + (((orig_value - (int)((sum + halfscale) >> > scalebits)) * amount) >> 16); > + > if (globalIdx.x < cw && globalIdx.y < ch) > dst[globalIdx.x + globalIdx.y*dst_stride_ch] = clip_uint8(res); > } > diff --git a/libavutil/opencl.c b/libavutil/opencl.c > index 36cb6fe..738d0db 100644 > --- a/libavutil/opencl.c > +++ b/libavutil/opencl.c > @@ -450,7 +450,24 @@ cl_program av_opencl_compile(const char > *program_name, const char *build_opts) > status = clBuildProgram(program, 1, &(opencl_ctx.device_id), > build_opts, NULL, NULL); > if (status != CL_SUCCESS) { > av_log(&opencl_ctx, AV_LOG_ERROR, > - "Compilation failed with OpenCL program: %s\n", > program_name); > + "Compilation failed with OpenCL program: '%s' with error %d > \n", program_name, status); > + > + // Determine the size of the log > + size_t log_size; > + clGetProgramBuildInfo(program, &(opencl_ctx.device_id), > CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); > + > + // Allocate memory for the log > + char *log = (char *) malloc(log_size+1); > + > + // Get the log > + clGetProgramBuildInfo(program, &(opencl_ctx.device_id), > CL_PROGRAM_BUILD_LOG, log_size, log, NULL); > + log[log_size] = '\0'; > + printf("--- Build log ---\n "); > + // Print the log > + printf("%s\n", log); > + printf("--- End Build log ---\n "); > + > + free(log); > program = NULL; > goto end; > } > -- > 1.8.4.msysgit.0 > > Hi
Merged OK on mingw, but compile error, the sha1 ID which based on is 61296d41e2de3b41304339e4631dd44c2e15f805 $ make -j4 CC libavfilter/unsharp_opencl.o e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavfilter/unsharp_opencl.c: In function 'copy_separable_masks': e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavfilter/unsharp_opencl.c:96:5: error: too few arguments to function 'av_mallocz_array' e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavutil/mem.h:228:41: note: declared here e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavfilter/unsharp_opencl.c:101:5: error: too few arguments to function 'av_mallocz_array' e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavutil/mem.h:228:41: note: declared here e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavfilter/unsharp_opencl.c: In function 'generate_mask': e:/ffmpeg/ffmpeg_submit/ffmpeg_updata/ffmpeg_unsharp/libavfilter/unsharp_opencl.c:130:5: warning: ISO C90 forbids mixed declarations and code [-Wdeclaration-after-stateme make: *** [libavfilter/unsharp_opencl.o] Error 1 Thanks Best regards > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel