On 29/04/2019 03:06, Song, Ruiling wrote:> > In order to verify the patch, I also have more testing on the CPU OpenCL > driver from Intel. > I make it run 100 times, and still not see any reported overflow. So I think > we can say the filter is in good quality to be merged. Any different idea?
I've tried a lot more times on some additional platforms (Skylake-GT3, Mali-G52) and I can't reproduce it on anything else. So, I think I agree that it must be a driver issue and shouldn't block anything. On 12/04/2019 16:09, Ruiling Song wrote: > Signed-off-by: Ruiling Song <ruiling.s...@intel.com> > --- > configure | 1 + > doc/filters.texi | 4 + > libavfilter/Makefile | 1 + > libavfilter/allfilters.c | 1 + > libavfilter/opencl/nlmeans.cl | 115 +++++++++ > libavfilter/opencl_source.h | 1 + > libavfilter/vf_nlmeans_opencl.c | 442 ++++++++++++++++++++++++++++++++ > 7 files changed, 565 insertions(+) > create mode 100644 libavfilter/opencl/nlmeans.cl > create mode 100644 libavfilter/vf_nlmeans_opencl.c > > ... > + > +static int nlmeans_plane(AVFilterContext *avctx, cl_mem dst, cl_mem src, > + cl_int width, cl_int height, cl_int p, cl_int r) > +{ > + NLMeansOpenCLContext *ctx = avctx->priv; > + const float zero = 0.0f; > + const size_t worksize1[] = {height}; > + const size_t worksize2[] = {width}; > + const size_t worksize3[2] = {width, height}; > + int dx, dy, err = 0, weight_buf_size; > + cl_int cle; > + int nb_pixel, *tmp, idx = 0; > + cl_int *dxdy; > + > + weight_buf_size = width * height * sizeof(float); > + cle = clEnqueueFillBuffer(ctx->command_queue, ctx->weight, > + &zero, sizeof(float), 0, weight_buf_size, > + 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill weight buffer: %d.\n", > + cle); > + cle = clEnqueueFillBuffer(ctx->command_queue, ctx->sum, > + &zero, sizeof(float), 0, weight_buf_size, > + 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to fill sum buffer: %d.\n", > + cle); > + > + nb_pixel = (2 * r + 1) * (2 * r + 1) - 1; > + dxdy = av_malloc(nb_pixel * 2 * sizeof(cl_int)); > + tmp = av_malloc(nb_pixel * 2 * sizeof(int)); > + > + if (!dxdy || !tmp) > + goto fail; > + > + for (dx = -r; dx <= r; dx++) { > + for (dy = -r; dy <= r; dy++) { > + if (dx || dy) { > + tmp[idx++] = dx; > + tmp[idx++] = dy; > + } > + } > + } > + // repack dx/dy seperately, as we want to do four pairs of dx/dy in a > batch > + for (int i = 0; i < nb_pixel / 4; i++) { > + dxdy[i * 8] = tmp[i * 8]; // dx0 > + dxdy[i * 8 + 1] = tmp[i * 8 + 2]; // dx1 > + dxdy[i * 8 + 2] = tmp[i * 8 + 4]; // dx2 > + dxdy[i * 8 + 3] = tmp[i * 8 + 6]; // dx3 > + dxdy[i * 8 + 4] = tmp[i * 8 + 1]; // dy0 > + dxdy[i * 8 + 5] = tmp[i * 8 + 3]; // dy1 > + dxdy[i * 8 + 6] = tmp[i * 8 + 5]; // dy2 > + dxdy[i * 8 + 7] = tmp[i * 8 + 7]; // dy3 > + } > + av_freep(&tmp); > + > + for (int i = 0; i < nb_pixel / 4; i++) { > + int *dx_cur = dxdy + 8 * i; > + int *dy_cur = dxdy + 8 * i + 4; cl_int. > + > + // horizontal pass > + // integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x] > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 0, cl_mem, &ctx->integral_img); > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 1, cl_mem, &src); > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 2, cl_int, &width); > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 3, cl_int, &height); > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 4, cl_int4, dx_cur); > + CL_SET_KERNEL_ARG(ctx->horiz_kernel, 5, cl_int4, dy_cur); > + cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->horiz_kernel, > 1, > + NULL, worksize1, NULL, 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue horiz_kernel: > %d.\n", > + cle); > + // vertical pass > + // integral(x, y) = sum(integral(x, v)) for v in [0, y] > + CL_SET_KERNEL_ARG(ctx->vert_kernel, 0, cl_mem, &ctx->integral_img); > + CL_SET_KERNEL_ARG(ctx->vert_kernel, 1, cl_mem, &ctx->overflow); > + CL_SET_KERNEL_ARG(ctx->vert_kernel, 2, cl_int, &width); > + CL_SET_KERNEL_ARG(ctx->vert_kernel, 3, cl_int, &height); > + cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->vert_kernel, > + 1, NULL, worksize2, NULL, 0, NULL, > NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue vert_kernel: > %d.\n", > + cle); > + > + // accumlate weights > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 0, cl_mem, &ctx->sum); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 1, cl_mem, &ctx->weight); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 2, cl_mem, &ctx->integral_img); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 3, cl_mem, &src); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 4, cl_int, &width); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 5, cl_int, &height); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 6, cl_int, &p); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 7, cl_float, &ctx->h); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 8, cl_int4, dx_cur); > + CL_SET_KERNEL_ARG(ctx->accum_kernel, 9, cl_int4, dy_cur); > + cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->accum_kernel, > + 2, NULL, worksize3, NULL, 0, NULL, > NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue kernel: %d.\n", > cle); > + } > + av_freep(&dxdy); > + > + // average > + CL_SET_KERNEL_ARG(ctx->average_kernel, 0, cl_mem, &dst); > + CL_SET_KERNEL_ARG(ctx->average_kernel, 1, cl_mem, &src); > + CL_SET_KERNEL_ARG(ctx->average_kernel, 2, cl_mem, &ctx->sum); > + CL_SET_KERNEL_ARG(ctx->average_kernel, 3, cl_mem, &ctx->weight); > + cle = clEnqueueNDRangeKernel(ctx->command_queue, ctx->average_kernel, 2, > + NULL, worksize3, NULL, 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to enqueue average kernel: %d.\n", > + cle); > + cle = clFlush(ctx->command_queue); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to flush command queue: %d.\n", > cle); > +fail: > + if (tmp) > + av_freep(&tmp); > + if (dxdy) > + av_freep(&dxdy); Funny indent. > + return err; > +} > + > +static int nlmeans_opencl_filter_frame(AVFilterLink *inlink, AVFrame *input) > +{ > + AVFilterContext *avctx = inlink->dst; > + AVFilterLink *outlink = avctx->outputs[0]; > + NLMeansOpenCLContext *ctx = avctx->priv; > + AVFrame *output = NULL; > + AVHWFramesContext *input_frames_ctx; > + const AVPixFmtDescriptor *desc; > + enum AVPixelFormat in_format; > + cl_mem src, dst; > + const cl_int zero = 0; > + int w, h, err, cle, overflow, p, patch, research; > + > + av_log(ctx, AV_LOG_DEBUG, "Filter input: %s, %ux%u (%"PRId64").\n", > + av_get_pix_fmt_name(input->format), > + input->width, input->height, input->pts); > + > + if (!input->hw_frames_ctx) > + return AVERROR(EINVAL); > + input_frames_ctx = (AVHWFramesContext*)input->hw_frames_ctx->data; > + in_format = input_frames_ctx->sw_format; > + > + output = ff_get_video_buffer(outlink, outlink->w, outlink->h); > + if (!output) { > + err = AVERROR(ENOMEM); > + goto fail; > + } > + > + err = av_frame_copy_props(output, input); > + if (err < 0) > + goto fail; > + > + if (!ctx->initialised) { > + desc = av_pix_fmt_desc_get(in_format); > + if (!is_format_supported(in_format)) { > + err = AVERROR(EINVAL); > + av_log(avctx, AV_LOG_ERROR, "input format %s not supported\n", > + av_get_pix_fmt_name(in_format)); > + goto fail; > + } > + ctx->chroma_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w); > + ctx->chroma_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h); > + > + err = nlmeans_opencl_init(avctx, inlink->w, inlink->h); > + if (err < 0) > + goto fail; > + } > + > + cle = clEnqueueWriteBuffer(ctx->command_queue, ctx->overflow, CL_FALSE, > + 0, sizeof(cl_int), &zero, 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to initialize overflow" > + "detection buffer %d.\n", cle); > + > + for (p = 0; p < FF_ARRAY_ELEMS(output->data); p++) { > + src = (cl_mem) input->data[p]; > + dst = (cl_mem) output->data[p]; > + > + if (!dst) > + break; I think I'd assert that src is not null here as well, just in case. > + w = p ? ctx->chroma_w : inlink->w; > + h = p ? ctx->chroma_h : inlink->h; > + patch = (p ? ctx->patch_size_uv : ctx->patch_size) / 2; > + research = (p ? ctx->research_size_uv : ctx->research_size) / 2; Is this intended for the GBRP case? Intuitively I would expect it to treat each of GBR the same, but maybe it's preferable for green to be special somehow. > + err = nlmeans_plane(avctx, dst, src, w, h, patch, research); > + if (err < 0) > + goto fail; > + } > + // overflow occured? > + cle = clEnqueueReadBuffer(ctx->command_queue, ctx->overflow, CL_FALSE, > + 0, sizeof(cl_int), &overflow, 0, NULL, NULL); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read overflow: %d.\n", cle); > + > + cle = clFinish(ctx->command_queue); > + CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to finish kernel: %d.\n", cle); > + > + if (overflow > 0) > + av_log(avctx, AV_LOG_ERROR, "integral image overflow %d\n", overflow); > + > + av_frame_free(&input); > + > + av_log(ctx, AV_LOG_DEBUG, "Filter output: %s, %ux%u (%"PRId64").\n", > + av_get_pix_fmt_name(output->format), > + output->width, output->height, output->pts); > + > + return ff_filter_frame(outlink, output); > + > +fail: > + clFinish(ctx->command_queue); > + av_frame_free(&input); > + av_frame_free(&output); > + return err; > +} > ... Thanks, - Mark _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".