2015-01-04 7:34 GMT+08:00 Alexey Titov <alex.ti....@gmail.com>: > From: atitov <alexey.ti...@amd.com> > > --- > libavfilter/unsharp.h | 4 ++ > libavfilter/unsharp_opencl.c | 74 +++++++++++++++------- > libavfilter/unsharp_opencl_kernel.h | 122 > ++++++++++++++++++++++++++---------- > 3 files changed, 145 insertions(+), 55 deletions(-) > > diff --git a/libavfilter/unsharp.h b/libavfilter/unsharp.h > index c2aed64..fc651c0 100644 > --- a/libavfilter/unsharp.h > +++ b/libavfilter/unsharp.h > @@ -41,6 +41,10 @@ typedef struct { > cl_kernel kernel_chroma; > cl_mem cl_luma_mask; > cl_mem cl_chroma_mask; > + cl_mem cl_luma_mask_x; > + cl_mem cl_chroma_mask_x; > + cl_mem cl_luma_mask_y; > + cl_mem cl_chroma_mask_y; > int in_plane_size[8]; > int out_plane_size[8]; > int plane_num; > diff --git a/libavfilter/unsharp_opencl.c b/libavfilter/unsharp_opencl.c > index 5c6b5ef..1923cb3 100644 > --- a/libavfilter/unsharp_opencl.c > +++ b/libavfilter/unsharp_opencl.c > @@ -87,11 +87,12 @@ end: > return ret; > } > > -static int compute_mask_matrix(cl_mem cl_mask_matrix, int step_x, int > step_y) > +static int copy_separable_masks(cl_mem cl_mask_x, cl_mem cl_mask_y, int > step_x, int step_y) > { > - int i, j, ret = 0; > - uint32_t *mask_matrix, *mask_x, *mask_y; > - size_t size_matrix = sizeof(uint32_t) * (2 * step_x + 1) * (2 * > step_y + 1); > + int ret = 0; > + uint32_t *mask_x, *mask_y; > + size_t size_mask_x = sizeof(uint32_t) * (2 * step_x + 1); > + size_t size_mask_y = sizeof(uint32_t) * (2 * step_y + 1); > mask_x = av_mallocz_array(2 * step_x + 1, sizeof(uint32_t)); > if (!mask_x) { > ret = AVERROR(ENOMEM); > @@ -102,37 +103,33 @@ static int compute_mask_matrix(cl_mem > cl_mask_matrix, int step_x, int step_y) > ret = AVERROR(ENOMEM); > goto end; > } > - mask_matrix = av_mallocz(size_matrix); > - if (!mask_matrix) { > - ret = AVERROR(ENOMEM); > - goto end; > - } > ret = compute_mask(step_x, mask_x); > if (ret < 0) > goto end; > ret = compute_mask(step_y, mask_y); > if (ret < 0) > goto end; > - for (j = 0; j < 2 * step_y + 1; j++) { > - for (i = 0; i < 2 * step_x + 1; i++) { > - mask_matrix[i + j * (2 * step_x + 1)] = mask_y[j] * mask_x[i]; > - } > - } > - ret = av_opencl_buffer_write(cl_mask_matrix, (uint8_t *)mask_matrix, > size_matrix); > + ret = av_opencl_buffer_write(cl_mask_x, (uint8_t *)mask_x, > size_mask_x); > + ret = av_opencl_buffer_write(cl_mask_y, (uint8_t *)mask_y, > size_mask_y); > end: > av_freep(&mask_x); > av_freep(&mask_y); > - av_freep(&mask_matrix); > return ret; > } > > static int generate_mask(AVFilterContext *ctx) > { > - UnsharpContext *unsharp = ctx->priv; > - int i, ret = 0, step_x[2], step_y[2]; > + cl_mem masks[4]; > cl_mem mask_matrix[2]; > + int i, ret = 0, step_x[2], step_y[2]; > + > + UnsharpContext *unsharp = ctx->priv; > mask_matrix[0] = unsharp->opencl_ctx.cl_luma_mask; > mask_matrix[1] = unsharp->opencl_ctx.cl_chroma_mask; > + masks[0] = unsharp->opencl_ctx.cl_luma_mask_x; > + masks[1] = unsharp->opencl_ctx.cl_luma_mask_y; > + masks[2] = unsharp->opencl_ctx.cl_chroma_mask_x; > + masks[3] = unsharp->opencl_ctx.cl_chroma_mask_y; > step_x[0] = unsharp->luma.steps_x; > step_x[1] = unsharp->chroma.steps_x; > step_y[0] = unsharp->luma.steps_y; > @@ -144,12 +141,16 @@ static int generate_mask(AVFilterContext *ctx) > else > unsharp->opencl_ctx.use_fast_kernels = 1; > > + if (!masks[0] || !masks[1] || !masks[2] || !masks[3]) { > + av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not > be NULL\n"); > + return AVERROR(EINVAL); > + } > if (!mask_matrix[0] || !mask_matrix[1]) { > av_log(ctx, AV_LOG_ERROR, "Luma mask and chroma mask should not > be NULL\n"); > return AVERROR(EINVAL); > } > for (i = 0; i < 2; i++) { > - ret = compute_mask_matrix(mask_matrix[i], step_x[i], step_y[i]); > + ret = copy_separable_masks(masks[2*i], masks[2*i+1], step_x[i], > step_y[i]); > if (ret < 0) > return ret; > } > @@ -184,7 +185,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > ret = avpriv_opencl_set_parameter(&kernel1, > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), > - > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_x), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_luma_mask_y), > > FF_OPENCL_PARAM_INFO(unsharp->luma.amount), > > FF_OPENCL_PARAM_INFO(unsharp->luma.scalebits), > > FF_OPENCL_PARAM_INFO(unsharp->luma.halfscale), > @@ -201,7 +203,8 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > ret = avpriv_opencl_set_parameter(&kernel2, > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_inbuf), > > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_outbuf), > - > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_x), > + > FF_OPENCL_PARAM_INFO(unsharp->opencl_ctx.cl_chroma_mask_y), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.amount), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.scalebits), > > FF_OPENCL_PARAM_INFO(unsharp->chroma.halfscale), > @@ -264,7 +267,9 @@ int ff_opencl_apply_unsharp(AVFilterContext *ctx, > AVFrame *in, AVFrame *out) > return AVERROR_EXTERNAL; > } > } > - clFinish(unsharp->opencl_ctx.command_queue); > + //blocking map is suffficient, no need for clFinish > + //clFinish(unsharp->opencl_ctx.command_queue); > + > return av_opencl_buffer_read_image(out->data, > unsharp->opencl_ctx.out_plane_size, > unsharp->opencl_ctx.plane_num, > unsharp->opencl_ctx.cl_outbuf, > > unsharp->opencl_ctx.cl_outbuf_size); > @@ -286,6 +291,27 @@ int ff_opencl_unsharp_init(AVFilterContext *ctx) > ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask, > sizeof(uint32_t) * (2 * > unsharp->chroma.steps_x + 1) * (2 * unsharp->chroma.steps_y + 1), > CL_MEM_READ_ONLY, NULL); > + // separable filters > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_x, > + sizeof(uint32_t) * (2 * > unsharp->luma.steps_x + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_luma_mask_y, > + sizeof(uint32_t) * (2 * > unsharp->luma.steps_y + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_x, > + sizeof(uint32_t) * (2 * > unsharp->chroma.steps_x + 1), > + CL_MEM_READ_ONLY, NULL); > + if (ret < 0) > + return ret; > + ret = av_opencl_buffer_create(&unsharp->opencl_ctx.cl_chroma_mask_y, > + sizeof(uint32_t) * (2 * > unsharp->chroma.steps_y + 1), > + CL_MEM_READ_ONLY, NULL); > if (ret < 0) > return ret; > ret = generate_mask(ctx); > @@ -339,6 +365,10 @@ void ff_opencl_unsharp_uninit(AVFilterContext *ctx) > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_outbuf); > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask); > av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_x); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_x); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_luma_mask_y); > + av_opencl_buffer_release(&unsharp->opencl_ctx.cl_chroma_mask_y); > clReleaseKernel(unsharp->opencl_ctx.kernel_default); > clReleaseKernel(unsharp->opencl_ctx.kernel_luma); > clReleaseKernel(unsharp->opencl_ctx.kernel_chroma); > diff --git a/libavfilter/unsharp_opencl_kernel.h > b/libavfilter/unsharp_opencl_kernel.h > index 9c4fd65..0fc802e 100644 > --- a/libavfilter/unsharp_opencl_kernel.h > +++ b/libavfilter/unsharp_opencl_kernel.h > @@ -36,7 +36,8 @@ inline unsigned char clip_uint8(int a) > kernel void unsharp_luma( > global unsigned char *src, > global unsigned char *dst, > - global int *mask, > + global int *mask_x, > + global int *mask_y, > int amount, > int scalebits, > int halfscale, > @@ -59,10 +60,12 @@ kernel void unsharp_luma( > return; > } > > - local uchar l[32][32]; > - local int lc[LU_RADIUS_X*LU_RADIUS_Y]; > + local short l[32][32]; > + local int lcx[LU_RADIUS_X]; > + local int lcy[LU_RADIUS_Y]; > int indexIx, indexIy, i, j; > > + //load up tile: actual workspace + halo of 8 points in x and y \n > for(i = 0; i <= 1; i++) { > indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; > indexIy = indexIy < 0 ? 0 : indexIy; > @@ -76,27 +79,54 @@ kernel void unsharp_luma( > } > > int indexL = threadIdx.y*16 + threadIdx.x; > - if (indexL < LU_RADIUS_X*LU_RADIUS_Y) > - lc[indexL] = mask[indexL]; > + if (indexL < LU_RADIUS_X) > + lcx[indexL] = mask_x[indexL]; > + if (indexL < LU_RADIUS_Y) > + lcy[indexL] = mask_y[indexL]; > barrier(CLK_LOCAL_MEM_FENCE); > > + //needed for unsharp mask application in the end \n > + int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > + > int idx, idy, maskIndex; > - int sum = 0; > - int steps_x = LU_RADIUS_X/2; > - int steps_y = LU_RADIUS_Y/2; > + int temp[2] = {0}; > + int steps_x = (LU_RADIUS_X-1)/2; > + int steps_y = (LU_RADIUS_Y-1)/2; > > - \n#pragma unroll\n > - for (i = -steps_y; i <= steps_y; i++) { > - idy = 8 + i + threadIdx.y; > - \n#pragma unroll\n > - for (j = -steps_x; j <= steps_x; j++) { > - idx = 8 + j + threadIdx.x; > - maskIndex = (i + steps_y)*LU_RADIUS_X + j + steps_x; > - sum += (int)l[idy][idx] * lc[maskIndex]; > + // compute the actual workspace + left&right halos \n > + \n#pragma unroll\n > + for (j = 0; j <=1; j++) { > + //extra work to cover left and right halos \n > + idx = 16*j + threadIdx.x; > + \n#pragma unroll\n > + for (i = -steps_y; i <= steps_y; i++) { > + idy = 8 + i + threadIdx.y; > + maskIndex = (i + steps_y); > + temp[j] += (int)l[idy][idx] * lcy[maskIndex]; > } > } > - int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > - int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * > amount) >> 16); > + barrier(CLK_LOCAL_MEM_FENCE); > + //save results from the vertical filter in local memory \n > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = 0; j <=1; j++) { > + idx = 16*j + threadIdx.x; > + l[idy][idx] = temp[j]; > + } > + barrier(CLK_LOCAL_MEM_FENCE); > + > + //compute results with the horizontal filter \n > + int sum = 0; > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = -steps_x; j <= steps_x; j++) { > + idx = 8 + j + threadIdx.x; > + maskIndex = j + steps_x; > + sum += (int)l[idy][idx] * lcx[maskIndex]; > + } > + > + int res = orig_value + (((orig_value - (int)((sum + halfscale) >> > scalebits)) * amount) >> 16); > + > if (globalIdx.x < width && globalIdx.y < height) > dst[globalIdx.x + globalIdx.y*dst_stride] = clip_uint8(res); > } > @@ -104,7 +134,8 @@ kernel void unsharp_luma( > kernel void unsharp_chroma( > global unsigned char *src_y, > global unsigned char *dst_y, > - global int *mask, > + global int *mask_x, > + global int *mask_y, > int amount, > int scalebits, > int halfscale, > @@ -141,8 +172,9 @@ kernel void unsharp_chroma( > return; > } > > - local uchar l[32][32]; > - local int lc[CH_RADIUS_X*CH_RADIUS_Y]; > + local ushort l[32][32]; > + local int lcx[CH_RADIUS_X]; > + local int lcy[CH_RADIUS_Y]; > int indexIx, indexIy, i, j; > for(i = 0; i <= 1; i++) { > indexIy = -8 + (blockIdx.y + i) * 16 + threadIdx.y; > @@ -157,27 +189,51 @@ kernel void unsharp_chroma( > } > > int indexL = threadIdx.y*16 + threadIdx.x; > - if (indexL < CH_RADIUS_X*CH_RADIUS_Y) > - lc[indexL] = mask[indexL]; > + if (indexL < CH_RADIUS_X) > + lcx[indexL] = mask_x[indexL]; > + if (indexL < CH_RADIUS_Y) > + lcy[indexL] = mask_y[indexL]; > barrier(CLK_LOCAL_MEM_FENCE); > > + int orig_value = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > + > int idx, idy, maskIndex; > - int sum = 0; > int steps_x = CH_RADIUS_X/2; > int steps_y = CH_RADIUS_Y/2; > + int temp[2] = {0,0}; > > \n#pragma unroll\n > - for (i = -steps_y; i <= steps_y; i++) { > - idy = 8 + i + threadIdx.y; > + for (j = 0; j <= 1; j++) { > + idx = 16*j + threadIdx.x; > \n#pragma unroll\n > - for (j = -steps_x; j <= steps_x; j++) { > - idx = 8 + j + threadIdx.x; > - maskIndex = (i + steps_y)*CH_RADIUS_X + j + steps_x; > - sum += (int)l[idy][idx] * lc[maskIndex]; > - } > + for (i = -steps_y; i <= steps_y; i++) { > + idy = 8 + i + threadIdx.y; > + maskIndex = i + steps_y; > + temp[j] += (int)l[idy][idx] * lcy[maskIndex]; > + } > + } > + > + barrier(CLK_LOCAL_MEM_FENCE); > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = 0; j <= 1; j++) { > + idx = 16*j + threadIdx.x; > + l[idy][idx] = temp[j]; > } > - int temp = (int)l[threadIdx.y + 8][threadIdx.x + 8]; > - int res = temp + (((temp - (int)((sum + halfscale) >> scalebits)) * > amount) >> 16); > + barrier(CLK_LOCAL_MEM_FENCE); > + > + //compute results with the horizontal filter \n > + int sum = 0; > + idy = 8 + threadIdx.y; > + \n#pragma unroll\n > + for (j = -steps_x; j <= steps_x; j++) { > + idx = 8 + j + threadIdx.x; > + maskIndex = j + steps_x; > + sum += (int)l[idy][idx] * lcx[maskIndex]; > + } > + > + int res = orig_value + (((orig_value - (int)((sum + halfscale) >> > scalebits)) * amount) >> 16); > + > if (globalIdx.x < cw && globalIdx.y < ch) > dst[globalIdx.x + globalIdx.y*dst_stride_ch] = clip_uint8(res); > } > -- > 1.8.4.msysgit.0 > > Hi This is the first part of the whole patch right? Could you send the second part?
Thanks Best regads > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel