On Sun, Sep 26, 2021 at 8:11 PM Paul B Mahol <one...@gmail.com> wrote: > > On Sun, Sep 26, 2021 at 1:27 PM myp...@gmail.com <myp...@gmail.com> wrote: > > > On Sun, Sep 26, 2021 at 4:11 PM Paul B Mahol <one...@gmail.com> wrote: > > > > > > Signed-off-by: Paul B Mahol <one...@gmail.com> > > > --- > > > libavfilter/vf_avgblur.c | 311 ++++++++++++++------------ > > > tests/ref/fate/filter-refcmp-psnr-yuv | 80 +++---- > > > 2 files changed, 211 insertions(+), 180 deletions(-) > > > > > > diff --git a/libavfilter/vf_avgblur.c b/libavfilter/vf_avgblur.c > > > index 3e222a43fa..a838285bb4 100644 > > > --- a/libavfilter/vf_avgblur.c > > > +++ b/libavfilter/vf_avgblur.c > > > @@ -20,6 +20,7 @@ > > > * SOFTWARE. > > > */ > > > > > > +#include "libavutil/avassert.h" > > > #include "libavutil/imgutils.h" > > > #include "libavutil/opt.h" > > > #include "libavutil/pixdesc.h" > > > @@ -36,13 +37,15 @@ typedef struct AverageBlurContext { > > > int planes; > > > > > > int depth; > > > + int max; > > > + int area; > > > int planewidth[4]; > > > int planeheight[4]; > > > - float *buffer; > > > + void *buffer; > > > + uint16_t lut[256 * 256 * 256]; > > > int nb_planes; > > > > > > - int (*filter_horizontally)(AVFilterContext *ctx, void *arg, int > > jobnr, int nb_jobs); > > > - int (*filter_vertically)(AVFilterContext *ctx, void *arg, int > > jobnr, int nb_jobs); > > > + int (*filter[2])(AVFilterContext *ctx, void *arg, int jobnr, int > > nb_jobs); > > > } AverageBlurContext; > > > > > > #define OFFSET(x) offsetof(AverageBlurContext, x) > > > @@ -60,124 +63,138 @@ AVFILTER_DEFINE_CLASS(avgblur); > > > typedef struct ThreadData { > > > int height; > > > int width; > > > - uint8_t *ptr; > > > - int linesize; > > > + const void *ptr; > > > + void *dptr; > > > + int linesize, dlinesize; > > > } ThreadData; > > > > > > -#define HORIZONTAL_FILTER(name, type) > > \ > > > -static int filter_horizontally_##name(AVFilterContext *ctx, void *arg, > > int jobnr, int nb_jobs)\ > > > -{ > > \ > > > - 
AverageBlurContext *s = ctx->priv; > > \ > > > - ThreadData *td = arg; > > \ > > > - const int height = td->height; > > \ > > > - const int width = td->width; > > \ > > > - const int slice_start = (height * jobnr ) / nb_jobs; > > \ > > > - const int slice_end = (height * (jobnr+1)) / nb_jobs; > > \ > > > - const int radius = FFMIN(s->radius, width / 2); > > \ > > > - const int linesize = td->linesize / sizeof(type); > > \ > > > - float *buffer = s->buffer; > > \ > > > - const type *src; > > \ > > > - float *ptr; > > \ > > > - int y, x; > > \ > > > - > > \ > > > - /* Filter horizontally along each row */ > > \ > > > - for (y = slice_start; y < slice_end; y++) { > > \ > > > - float acc = 0; > > \ > > > - int count = 0; > > \ > > > - > > \ > > > - src = (const type *)td->ptr + linesize * y; > > \ > > > - ptr = buffer + width * y; > > \ > > > - > > \ > > > - for (x = 0; x < radius; x++) { > > \ > > > - acc += src[x]; > > \ > > > - } > > \ > > > - count += radius; > > \ > > > - > > \ > > > - for (x = 0; x <= radius; x++) { > > \ > > > - acc += src[x + radius]; > > \ > > > - count++; > > \ > > > - ptr[x] = acc / count; > > \ > > > - } > > \ > > > - > > \ > > > - for (; x < width - radius; x++) { > > \ > > > - acc += src[x + radius] - src[x - radius - 1]; > > \ > > > - ptr[x] = acc / count; > > \ > > > - } > > \ > > > - > > \ > > > - for (; x < width; x++) { > > \ > > > - acc -= src[x - radius]; > > \ > > > - count--; > > \ > > > - ptr[x] = acc / count; > > \ > > > - } > > \ > > > - } > > \ > > > - > > \ > > > - return 0; > > \ > > > +#define LUT_DIV(sum, area) (lut[(sum)]) > > > +#define SLOW_DIV(sum, area) ((sum) / (area)) > > > + > > > +#define FILTER(name, type, btype, lutunused, areaunused, lutdiv) > > \ > > > +static int filter_##name(AVFilterContext *ctx, void *arg, int jobnr, > > int nb_jobs) \ > > > +{ > > \ > > > + AverageBlurContext *s = ctx->priv; > > \ > > > + ThreadData *td = arg; > > \ > > > + areaunused const int area = s->area; > > \ > > > + lutunused 
const uint16_t *lut = s->lut; > > \ > > > + const int size_w = s->radius; > > \ > > > + const int size_h = s->radiusV; > > \ > > > + btype *col_sum = (btype *)s->buffer + size_w; > > \ > > > + const int dlinesize = td->dlinesize / sizeof(type); > > \ > > > + const int linesize = td->linesize / sizeof(type); > > \ > > > + const int height = td->height; > > \ > > > + const int width = td->width; > > \ > > > + const type *src = td->ptr; > > \ > > > + type *dst = td->dptr; > > \ > > > + btype sum = 0; > > \ > > > + > > \ > > > + for (int x = -size_w; x < 0; x++) { > > \ > > > + sum = src[0] * size_h; > > \ > > > + for (int y = 0; y <= size_h; y++) > > \ > > > + sum += src[y * linesize]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + col_sum[x] = sum; > > \ > > > + } > > \ > > > + > > \ > > > + for (int x = 0; x < width; x++) { > > \ > > > + sum = src[x] * size_h; > > \ > > > + for (int y = 0; y <= size_h; y++) > > \ > > > + sum += src[x + y * linesize]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + col_sum[x] = sum; > > \ > > > + } > > \ > > > + > > \ > > > + for (int x = width; x < width + size_w; x++) { > > \ > > > + sum = src[width - 1] * size_h; > > \ > > > + for (int y = 0; y <= size_h; y++) > > \ > > > + sum += src[width - 1 + y * linesize]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + col_sum[x] = sum; > > \ > > > + } > > \ > > > + > > \ > > > + sum = 0; > > \ > > > + for (int x = -size_w; x <= size_w; x++) > > \ > > > + sum += col_sum[x]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + dst[0] = lutdiv(sum, area); > > \ > > > + > > \ > > > + for (int x = 1; x < width; x++) { > > \ > > > + sum = sum - col_sum[x - size_w - 1] + col_sum[x + size_w]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + dst[x] = lutdiv(sum, area); > > \ > > > + } > > \ > > > + > > \ > > > + src = td->ptr; > > \ > > > + src += linesize; > > \ > > > + dst += dlinesize; > > \ > > > + > > \ > > > + for (int y = 1; y < height; y++) { > > \ > > > + const int syp = FFMIN(size_h, 
height - y - 1) * linesize; > > \ > > > + const int syn = FFMIN(y, size_h + 1) * linesize; > > \ > > > + > > \ > > > + sum = 0; > > \ > > > + > > \ > > > + for (int x = -size_w; x < 0; x++) > > \ > > > + col_sum[x] += src[0 + syp] - src[0 - syn]; > > \ > > > + > > \ > > > + for (int x = 0; x < width; x++) > > \ > > > + col_sum[x] += src[x + syp] - src[x - syn]; > > \ > > > + > > \ > > > + for (int x = width; x < width + size_w; x++) > > \ > > > + col_sum[x] += src[width - 1 + syp] - src[width - 1 - syn]; > > \ > > > + > > \ > > > + for (int x = -size_w; x <= size_w; x++) > > \ > > > + sum += col_sum[x]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + dst[0] = lutdiv(sum, area); > > \ > > > + > > \ > > > + for (int x = 1; x < width; x++) { > > \ > > > + sum = sum - col_sum[x - size_w - 1] + col_sum[x + size_w]; > > \ > > > + av_assert2(sum >= 0); > > \ > > > + dst[x] = lutdiv(sum, area); > > \ > > > + } > > \ > > > + > > \ > > > + src += linesize; > > \ > > > + dst += dlinesize; > > \ > > > + } > > \ > > > + > > \ > > > + return 0; > > \ > > > } > > > > > > -HORIZONTAL_FILTER(8, uint8_t) > > > -HORIZONTAL_FILTER(16, uint16_t) > > > - > > > -#define VERTICAL_FILTER(name, type) > > \ > > > -static int filter_vertically_##name(AVFilterContext *ctx, void *arg, > > int jobnr, int nb_jobs) \ > > > -{ > > \ > > > - AverageBlurContext *s = ctx->priv; > > \ > > > - ThreadData *td = arg; > > \ > > > - const int height = td->height; > > \ > > > - const int width = td->width; > > \ > > > - const int slice_start = (width * jobnr ) / nb_jobs; > > \ > > > - const int slice_end = (width * (jobnr+1)) / nb_jobs; > > \ > > > - const int radius = FFMIN(s->radiusV, height / 2); > > \ > > > - const int linesize = td->linesize / sizeof(type); > > \ > > > - type *buffer = (type *)td->ptr; > > \ > > > - const float *src; > > \ > > > - type *ptr; > > \ > > > - int i, x; > > \ > > > - > > \ > > > - /* Filter vertically along each column */ > > \ > > > - for (x = slice_start; x < slice_end; 
x++) { > > \ > > > - float acc = 0; > > \ > > > - int count = 0; > > \ > > > - > > \ > > > - src = s->buffer + x; > > \ > > > - > > \ > > > - for (i = 0; i < radius; i++) { > > \ > > > - acc += src[0]; > > \ > > > - src += width; > > \ > > > - } > > \ > > > - count += radius; > > \ > > > - > > \ > > > - src = s->buffer + x; > > \ > > > - ptr = buffer + x; > > \ > > > - for (i = 0; i + radius < height && i <= radius; i++) { > > \ > > > - acc += src[(i + radius) * width]; > > \ > > > - count++; > > \ > > > - ptr[i * linesize] = acc / count; > > \ > > > - } > > \ > > > - > > \ > > > - for (; i < height - radius; i++) { > > \ > > > - acc += src[(i + radius) * width] - src[(i - radius - 1) * > > width]; \ > > > - ptr[i * linesize] = acc / count; > > \ > > > - } > > \ > > > - > > \ > > > - for (; i < height; i++) { > > \ > > > - acc -= src[(i - radius) * width]; > > \ > > > - count--; > > \ > > > - ptr[i * linesize] = acc / count; > > \ > > > - } > > \ > > > - } > > \ > > > - > > \ > > > - return 0; > > \ > > > -} > > > +FILTER(lut8, uint8_t, int32_t, , av_unused, LUT_DIV) > > > +FILTER(lut16, uint16_t, int64_t, , av_unused, LUT_DIV) > > > + > > > +FILTER(slow8, uint8_t, int32_t, av_unused, , SLOW_DIV) > > > +FILTER(slow16, uint16_t, int64_t, av_unused, , SLOW_DIV) > > > + > > > +static void build_lut(AVFilterContext *ctx, int max) > > > +{ > > > + AverageBlurContext *s = ctx->priv; > > > + const int area = (2 * s->radiusV + 1) * (2 * s->radius + 1); > > > + > > > + s->area = area; > > > + if (max * area >= FF_ARRAY_ELEMS(s->lut)) > > > + return; > > > + > > > + for (int i = 0, j = 0, k = 0; i < max * area; i++, j++) { > > > + if (j == area) { > > > + k++; > > > + j = 0; > > > + } > > > > > > -VERTICAL_FILTER(8, uint8_t) > > > -VERTICAL_FILTER(16, uint16_t) > > > + s->lut[i] = k; > > > + } > > > +} > > > > > > static int config_input(AVFilterLink *inlink) > > > { > > > + AVFilterContext *ctx = inlink->dst; > > > const AVPixFmtDescriptor *desc = > > 
av_pix_fmt_desc_get(inlink->format); > > > - AverageBlurContext *s = inlink->dst->priv; > > > + AverageBlurContext *s = ctx->priv; > > > > > > s->depth = desc->comp[0].depth; > > > + s->max = 1 << s->depth; > > > s->planewidth[1] = s->planewidth[2] = AV_CEIL_RSHIFT(inlink->w, > > desc->log2_chroma_w); > > > s->planewidth[0] = s->planewidth[3] = inlink->w; > > > s->planeheight[1] = s->planeheight[2] = AV_CEIL_RSHIFT(inlink->h, > > desc->log2_chroma_h); > > > @@ -185,21 +202,20 @@ static int config_input(AVFilterLink *inlink) > > > > > > s->nb_planes = av_pix_fmt_count_planes(inlink->format); > > > > > > - s->buffer = av_malloc_array(inlink->w, inlink->h * > > sizeof(*s->buffer)); > > > + s->buffer = av_calloc(inlink->w + (1024 * 2 + 1), 4 * ((s->depth + > > 7) / 8)); > > > if (!s->buffer) > > > return AVERROR(ENOMEM); > > > > > > - if (s->radiusV <= 0) { > > > + if (s->radiusV <= 0) > > > s->radiusV = s->radius; > > > - } > > > > > > - if (s->depth == 8) { > > > - s->filter_horizontally = filter_horizontally_8; > > > - s->filter_vertically = filter_vertically_8; > > > - } else { > > > - s->filter_horizontally = filter_horizontally_16; > > > - s->filter_vertically = filter_vertically_16; > > > - } > > > + s->filter[0] = s->depth <= 8 ? filter_lut8 : filter_lut16; > > > + s->filter[1] = s->depth <= 8 ? 
filter_slow8 : filter_slow16; > > > + > > > + s->radius = FFMIN(s->planewidth[1] / 2, s->radius); > > > + s->radiusV = FFMIN(s->planeheight[1] / 2, s->radiusV); > > > + > > > + build_lut(ctx, s->max); > > > > > > return 0; > > > } > > > @@ -209,19 +225,16 @@ static void averageiir2d(AVFilterContext *ctx, > > AVFrame *in, AVFrame *out, int pl > > > AverageBlurContext *s = ctx->priv; > > > const int width = s->planewidth[plane]; > > > const int height = s->planeheight[plane]; > > > - const int nb_threads = ff_filter_get_nb_threads(ctx); > > > + const int slow = (s->max * s->area) >= FF_ARRAY_ELEMS(s->lut); > > > ThreadData td; > > > > > > td.width = width; > > > td.height = height; > > > td.ptr = in->data[plane]; > > > td.linesize = in->linesize[plane]; > > > - ff_filter_execute(ctx, s->filter_horizontally, &td, > > > - NULL, FFMIN(height, nb_threads)); > > > - td.ptr = out->data[plane]; > > > - td.linesize = out->linesize[plane]; > > > - ff_filter_execute(ctx, s->filter_vertically, &td, > > > - NULL, FFMIN(width, nb_threads)); > > > + td.dptr = out->data[plane]; > > > + td.dlinesize = out->linesize[plane]; > > > + s->filter[slow](ctx, &td, 0, 0); > > > } > > > > > > static int query_formats(AVFilterContext *ctx) > > > @@ -259,16 +272,12 @@ static int filter_frame(AVFilterLink *inlink, > > AVFrame *in) > > > AVFrame *out; > > > int plane; > > > > > > - if (av_frame_is_writable(in)) { > > > - out = in; > > > - } else { > > > - out = ff_get_video_buffer(outlink, outlink->w, outlink->h); > > > - if (!out) { > > > - av_frame_free(&in); > > > - return AVERROR(ENOMEM); > > > - } > > > - av_frame_copy_props(out, in); > > > + out = ff_get_video_buffer(outlink, outlink->w, outlink->h); > > > + if (!out) { > > > + av_frame_free(&in); > > > + return AVERROR(ENOMEM); > > > } > > > + av_frame_copy_props(out, in); > > > > > > for (plane = 0; plane < s->nb_planes; plane++) { > > > const int height = s->planeheight[plane]; > > > @@ -285,11 +294,33 @@ static int 
filter_frame(AVFilterLink *inlink, > > AVFrame *in) > > > averageiir2d(ctx, in, out, plane); > > > } > > > > > > - if (out != in) > > > - av_frame_free(&in); > > > + av_frame_free(&in); > > > return ff_filter_frame(outlink, out); > > > } > > > > > > +static int process_command(AVFilterContext *ctx, const char *cmd, const > > char *args, > > > + char *res, int res_len, int flags) > > > +{ > > > + AverageBlurContext *s = ctx->priv; > > > + const int area = s->area; > > > + int ret; > > > + > > > + ret = ff_filter_process_command(ctx, cmd, args, res, res_len, > > flags); > > > + if (ret < 0) > > > + return ret; > > > + > > > + if (s->radiusV <= 0) > > > + s->radiusV = s->radius; > > > + > > > + s->radius = FFMIN(s->planewidth[1] / 2, s->radius); > > > + s->radiusV = FFMIN(s->planeheight[1] / 2, s->radiusV); > > > + > > > + if (area != (2 * s->radiusV + 1) * (2 * s->radius + 1)) > > > + build_lut(ctx, s->max); > > > + > > > + return 0; > > > +} > > > + > > > static av_cold void uninit(AVFilterContext *ctx) > > > { > > > AverageBlurContext *s = ctx->priv; > > > @@ -322,6 +353,6 @@ const AVFilter ff_vf_avgblur = { > > > .query_formats = query_formats, > > > FILTER_INPUTS(avgblur_inputs), > > > FILTER_OUTPUTS(avgblur_outputs), > > > - .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC | > > AVFILTER_FLAG_SLICE_THREADS, > > > - .process_command = ff_filter_process_command, > > > + .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, > > > + .process_command = process_command, > > > }; > > > diff --git a/tests/ref/fate/filter-refcmp-psnr-yuv > > b/tests/ref/fate/filter-refcmp-psnr-yuv > > > index 0e634ed0e4..196d3da74e 100644 > > > --- a/tests/ref/fate/filter-refcmp-psnr-yuv > > > +++ b/tests/ref/fate/filter-refcmp-psnr-yuv > > > @@ -1,45 +1,45 @@ > > > frame:0 pts:0 pts_time:0 > > > -lavfi.psnr.mse.y=222.06 > > > -lavfi.psnr.psnr.y=24.67 > > > -lavfi.psnr.mse.u=339.38 > > > -lavfi.psnr.psnr.u=22.82 > > > -lavfi.psnr.mse.v=705.41 > > > -lavfi.psnr.psnr.v=19.65 > > > 
-lavfi.psnr.mse_avg=372.23 > > > -lavfi.psnr.psnr_avg=22.42 > > > +lavfi.psnr.mse.y=218.435333 > > > +lavfi.psnr.psnr.y=24.737576 > > > +lavfi.psnr.mse.u=336.693390 > > > +lavfi.psnr.psnr.u=22.858458 > > > +lavfi.psnr.mse.v=698.968384 > > > +lavfi.psnr.psnr.v=19.686228 > > > +lavfi.psnr.mse_avg=368.133118 > > > +lavfi.psnr.psnr_avg=22.470755 > > > frame:1 pts:1 pts_time:1 > > > -lavfi.psnr.mse.y=236.74 > > > -lavfi.psnr.psnr.y=24.39 > > > -lavfi.psnr.mse.u=416.17 > > > -lavfi.psnr.psnr.u=21.94 > > > -lavfi.psnr.mse.v=704.98 > > > -lavfi.psnr.psnr.v=19.65 > > > -lavfi.psnr.mse_avg=398.66 > > > -lavfi.psnr.psnr_avg=22.12 > > > +lavfi.psnr.mse.y=232.656189 > > > +lavfi.psnr.psnr.y=24.463657 > > > +lavfi.psnr.mse.u=413.841064 > > > +lavfi.psnr.psnr.u=21.962467 > > > +lavfi.psnr.mse.v=693.103577 > > > +lavfi.psnr.psnr.v=19.722822 > > > +lavfi.psnr.mse_avg=393.064240 > > > +lavfi.psnr.psnr_avg=22.186169 > > > frame:2 pts:2 pts_time:2 > > > -lavfi.psnr.mse.y=234.79 > > > -lavfi.psnr.psnr.y=24.42 > > > -lavfi.psnr.mse.u=435.72 > > > -lavfi.psnr.psnr.u=21.74 > > > -lavfi.psnr.mse.v=699.60 > > > -lavfi.psnr.psnr.v=19.68 > > > -lavfi.psnr.mse_avg=401.23 > > > -lavfi.psnr.psnr_avg=22.10 > > > +lavfi.psnr.mse.y=230.470032 > > > +lavfi.psnr.psnr.y=24.504660 > > > +lavfi.psnr.mse.u=433.524109 > > > +lavfi.psnr.psnr.u=21.760672 > > > +lavfi.psnr.mse.v=693.391174 > > > +lavfi.psnr.psnr.v=19.721020 > > > +lavfi.psnr.mse_avg=396.963837 > > > +lavfi.psnr.psnr_avg=22.143293 > > > frame:3 pts:3 pts_time:3 > > > -lavfi.psnr.mse.y=250.88 > > > -lavfi.psnr.psnr.y=24.14 > > > -lavfi.psnr.mse.u=479.73 > > > -lavfi.psnr.psnr.u=21.32 > > > -lavfi.psnr.mse.v=707.55 > > > -lavfi.psnr.psnr.v=19.63 > > > -lavfi.psnr.mse_avg=422.26 > > > -lavfi.psnr.psnr_avg=21.88 > > > +lavfi.psnr.mse.y=247.346817 > > > +lavfi.psnr.psnr.y=24.197741 > > > +lavfi.psnr.mse.u=476.365723 > > > +lavfi.psnr.psnr.u=21.351398 > > > +lavfi.psnr.mse.v=700.987549 > > > +lavfi.psnr.psnr.v=19.673700 > > > 
+lavfi.psnr.mse_avg=418.011719 > > > +lavfi.psnr.psnr_avg=21.918919 > > > frame:4 pts:4 pts_time:4 > > > -lavfi.psnr.mse.y=241.05 > > > -lavfi.psnr.psnr.y=24.31 > > > -lavfi.psnr.mse.u=505.04 > > > -lavfi.psnr.psnr.u=21.10 > > > -lavfi.psnr.mse.v=716.00 > > > -lavfi.psnr.psnr.v=19.58 > > > -lavfi.psnr.mse_avg=425.79 > > > -lavfi.psnr.psnr_avg=21.84 > > > +lavfi.psnr.mse.y=237.129654 > > > +lavfi.psnr.psnr.y=24.380945 > > > +lavfi.psnr.mse.u=503.722931 > > > +lavfi.psnr.psnr.u=21.108887 > > > +lavfi.psnr.mse.v=708.932678 > > > +lavfi.psnr.psnr.v=19.624754 > > > +lavfi.psnr.mse_avg=421.728729 > > > +lavfi.psnr.psnr_avg=21.880472 > > > -- > > > 2.33.0 > > > > > > > Do you have some performance data after applying the faster algorithm > > in your test bed? I think the data will help others, thx > > > > The previous algorithm used floats and did everything in 2 passes. > > This code is several times faster, or has the same speed as the previous code (when > using big size values). > For small radii, divisions are avoided with LUTs, and this gives the biggest > speed-up. > > I have numbers, but they are only useful for my setup. Now the filter is even > faster than the heavily optimized gblur filter under clang-12. > I see, will try this patch locally, thx _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".