On Aug 3, 2014 1:27 PM, "Clément Bœsch" <u...@pkh.me> wrote: > > This removes the avcodec dependency and make the code almost twice as > fast. More to come. > > The DCT factorization is based on "Fast and numerically stable > algorithms for discrete cosine transforms" from Gerlind Plonkaa & > Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015). > --- > configure | 2 - > libavfilter/vf_dctdnoiz.c | 328 +++++++++++++++++++++++++++++++++------------- > 2 files changed, 240 insertions(+), 90 deletions(-)
This change warrants a Changelog entry. > > diff --git a/configure b/configure > index 9c3af50..6196b2a 100755 > --- a/configure > +++ b/configure > @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl" > bs2b_filter_deps="libbs2b" > colormatrix_filter_deps="gpl" > cropdetect_filter_deps="gpl" > -dctdnoiz_filter_deps="avcodec" > -dctdnoiz_filter_select="dct" > delogo_filter_deps="gpl" > deshake_filter_deps="avcodec" > deshake_filter_select="me_cmp" > diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c > index 71b8536..6d24934 100644 > --- a/libavfilter/vf_dctdnoiz.c > +++ b/libavfilter/vf_dctdnoiz.c > @@ -1,5 +1,5 @@ > /* > - * Copyright (c) 2013 Clément Bœsch > + * Copyright (c) 2013-2014 Clément Bœsch > * > * This file is part of FFmpeg. > * > @@ -23,7 +23,6 @@ > * @see http://www.ipol.im/pub/art/2011/ys-dct/ > */ > > -#include "libavcodec/avfft.h" > #include "libavutil/eval.h" > #include "libavutil/opt.h" > #include "drawutils.h" > @@ -35,7 +34,7 @@ > static const char *const var_names[] = { "c", NULL }; > enum { VAR_C, VAR_VARS_NB }; > > -typedef struct { > +typedef struct DCTdnoizContext { > const AVClass *class; > > /* coefficient factor expression */ > @@ -52,8 +51,9 @@ typedef struct { > int p_linesize; // line sizes for color and weights > int overlap; // number of block overlapping pixels > int step; // block step increment (BSIZE - overlap) > - DCTContext *dct, *idct; // DCT and inverse DCT contexts > - float *block, *tmp_block; // two BSIZE x BSIZE block buffers > + void (*filter_freq_func)(struct DCTdnoizContext *s, > + const float *src, int src_linesize, > + float *dst, int dst_linesize); > } DCTdnoizContext; > > #define OFFSET(x) offsetof(DCTdnoizContext, x) > @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = { > > AVFILTER_DEFINE_CLASS(dctdnoiz); > > -static float *dct_block(DCTdnoizContext *ctx, const float *src, int src_linesize) > +static void av_always_inline fdct16_1d(float *dst, const float *src, > + int dst_stridea, int dst_strideb, > + int src_stridea, int src_strideb) > { > - int x, y; > - float *column; > - > - for (y = 0; y < BSIZE; y++) { > - float *line = ctx->block; > + int i; > > - memcpy(line, src, BSIZE * sizeof(*line)); > - src += src_linesize; > - av_dct_calc(ctx->dct, line); > - > - column = ctx->tmp_block + y; > - column[0] = line[0] * (1. / sqrt(BSIZE)); > - column += BSIZE; > - for (x = 1; x < BSIZE; x++) { > - *column = line[x] * sqrt(2. / BSIZE); > - column += BSIZE; > - } > + for (i = 0; i < BSIZE; i++) { > + const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea]; > + const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea]; > + const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea]; > + const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea]; > + const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea]; > + const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea]; > + const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea]; > + const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea]; > + const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea]; > + const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea]; > + const float x0_a = src[ 2*src_stridea] - src[13*src_stridea]; > + const float x0_b = src[ 3*src_stridea] - src[12*src_stridea]; > + const float x0_c = src[ 4*src_stridea] - src[11*src_stridea]; > + const float x0_d = src[ 5*src_stridea] - src[10*src_stridea]; > + const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea]; > + const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea]; > + const float x2_0 = x0_0 + x0_7; > + const float x2_1 = x0_1 + x0_6; > + const float x2_2 = x0_2 + x0_5; > + const float x2_3 = x0_3 + x0_4; > + const float x2_4 = x0_0 - x0_7; > + const float x2_5 = x0_1 - x0_6; > + const float x2_6 = x0_2 - x0_5; > + const float x2_7 = x0_3 - x0_4; > + const float x4_0 = x2_0 + x2_3; > + const float x4_1 = x2_1 + x2_2; > + const float x4_2 = x2_0 - x2_3; > + const float x4_3 = x2_1 - x2_2; > + const float x5_0 = x4_0 + x4_1; > + const float x5_1 = x4_0 - x4_1; > + const float x5_2 = 1.306562964876380*x4_2 + 0.541196100146197*x4_3; > + const float x5_3 = 0.541196100146197*x4_2 - 1.306562964876380*x4_3; > + const float x6_0 = 1.387039845322150*x2_4 + 0.275899379282943*x2_7; > + const float x6_1 = 1.175875602419360*x2_5 + 0.785694958387102*x2_6; > + const float x6_2 = -0.785694958387102*x2_5 + 1.175875602419360*x2_6; > + const float x6_3 = 0.275899379282943*x2_4 - 1.387039845322150*x2_7; > + const float x7_0 = x6_0 + x6_1; > + const float x7_1 = x6_0 - x6_1; > + const float x7_2 = x6_2 + x6_3; > + const float x7_3 = x6_2 - x6_3; > + const float x3_5 = 0.707106781186547*x7_1 - 0.707106781186547*x7_3; > + const float x3_6 = 0.707106781186547*x7_1 + 0.707106781186547*x7_3; > + const float x8_0 = 1.407403737526380*x0_8 + 0.138617169199091*x0_f; > + const float x8_1 = 1.353318001174350*x0_9 + 0.410524527522357*x0_e; > + const float x8_2 = 1.247225012986670*x0_a + 0.666655658477747*x0_d; > + const float x8_3 = 1.093201867001760*x0_b + 0.897167586342636*x0_c; > + const float x8_4 = -0.897167586342636*x0_b + 1.093201867001760*x0_c; > + const float x8_5 = 0.666655658477747*x0_a - 1.247225012986670*x0_d; > + const float x8_6 = -0.410524527522357*x0_9 + 1.353318001174350*x0_e; > + const float x8_7 = 0.138617169199091*x0_8 - 1.407403737526380*x0_f; > + const float xa_0 = x8_0 + x8_3; > + const float xa_1 = x8_1 + x8_2; > + const float xa_2 = x8_0 - x8_3; > + const float xa_3 = x8_1 - x8_2; > + const float xb_0 = xa_0 + xa_1; > + const float xb_1 = xa_0 - xa_1; > + const float xb_2 = 1.306562964876380*xa_2 + 0.541196100146197*xa_3; > + const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3; > + const float xc_0 = x8_4 + x8_7; > + const float xc_1 = x8_5 + x8_6; > + const float xc_2 = x8_4 - x8_7; > + const float xc_3 = x8_5 - x8_6; > + const float xd_0 = xc_0 + xc_1; > + const float xd_1 = xc_0 - xc_1; > + const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3; > + const float xd_3 = 0.541196100146197*xc_2 - 1.306562964876380*xc_3; > + const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3; > + const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3; > + const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1; > + const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1; > + const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2; > + const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2; > + dst[ 0*dst_stridea] = 0.25*x5_0; > + dst[ 1*dst_stridea] = 0.25*xb_0; > + dst[ 2*dst_stridea] = 0.25*x7_0; > + dst[ 3*dst_stridea] = 0.25*x1_9; > + dst[ 4*dst_stridea] = 0.25*x5_2; > + dst[ 5*dst_stridea] = 0.25*x1_a; > + dst[ 6*dst_stridea] = 0.25*x3_5; > + dst[ 7*dst_stridea] = 0.25*x1_b; > + dst[ 8*dst_stridea] = 0.25*x5_1; > + dst[ 9*dst_stridea] = 0.25*x1_c; > + dst[10*dst_stridea] = 0.25*x3_6; > + dst[11*dst_stridea] = 0.25*x1_d; > + dst[12*dst_stridea] = 0.25*x5_3; > + dst[13*dst_stridea] = 0.25*x1_e; > + dst[14*dst_stridea] = 0.25*x7_2; > + dst[15*dst_stridea] = 0.25*xd_0; > + dst += dst_strideb; > + src += src_strideb; > } > +} > > - column = ctx->tmp_block; > - for (x = 0; x < BSIZE; x++) { > - av_dct_calc(ctx->dct, column); > - column[0] *= 1. / sqrt(BSIZE); > - for (y = 1; y < BSIZE; y++) > - column[y] *= sqrt(2. / BSIZE); > - column += BSIZE; > +static void av_always_inline idct16_1d(float *dst, const float *src, > + int dst_stridea, int dst_strideb, > + int src_stridea, int src_strideb, > + int add) > +{ > + int i; > + > + for (i = 0; i < BSIZE; i++) { > + const float x0_0 = 1.414213562373100*src[ 0*src_stridea]; > + const float x0_1 = 1.407403737526380*src[ 1*src_stridea] + 0.138617169199091*src[15*src_stridea]; > + const float x0_2 = 1.387039845322150*src[ 2*src_stridea] + 0.275899379282943*src[14*src_stridea]; > + const float x0_3 = 1.353318001174350*src[ 3*src_stridea] + 0.410524527522357*src[13*src_stridea]; > + const float x0_4 = 1.306562964876380*src[ 4*src_stridea] + 0.541196100146197*src[12*src_stridea]; > + const float x0_5 = 1.247225012986670*src[ 5*src_stridea] + 0.666655658477747*src[11*src_stridea]; > + const float x0_6 = 1.175875602419360*src[ 6*src_stridea] + 0.785694958387102*src[10*src_stridea]; > + const float x0_7 = 1.093201867001760*src[ 7*src_stridea] + 0.897167586342636*src[ 9*src_stridea]; > + const float x0_8 = 1.414213562373100*src[ 8*src_stridea]; > + const float x0_9 = -0.897167586342636*src[ 7*src_stridea] + 1.093201867001760*src[ 9*src_stridea]; > + const float x0_a = 0.785694958387102*src[ 6*src_stridea] - 1.175875602419360*src[10*src_stridea]; > + const float x0_b = -0.666655658477747*src[ 5*src_stridea] + 1.247225012986670*src[11*src_stridea]; > + const float x0_c = 0.541196100146197*src[ 4*src_stridea] - 1.306562964876380*src[12*src_stridea]; > + const float x0_d = -0.410524527522357*src[ 3*src_stridea] + 1.353318001174350*src[13*src_stridea]; > + const float x0_e = 0.275899379282943*src[ 2*src_stridea] - 1.387039845322150*src[14*src_stridea]; > + const float x0_f = -0.138617169199091*src[ 1*src_stridea] + 1.407403737526380*src[15*src_stridea]; > + const float x2_0 = x0_0 + x0_8; > + const float x2_1 = x0_1 + x0_7; > + const float x2_2 = x0_2 + x0_6; > + const float x2_3 = x0_3 + x0_5; > + const float x2_4 = 1.4142135623731*x0_4; > + const float x2_5 = x0_0 - x0_8; > + const float x2_6 = x0_1 - x0_7; > + const float x2_7 = x0_2 - x0_6; > + const float x2_8 = x0_3 - x0_5; > + const float x4_0 = x2_0 + x2_4; > + const float x4_1 = x2_1 + x2_3; > + const float x4_2 = 1.4142135623731*x2_2; > + const float x4_3 = x2_0 - x2_4; > + const float x4_4 = x2_1 - x2_3; > + const float x5_0 = 0.5*x4_0 + 0.707106781186548*x4_1 + 0.5*x4_2; > + const float x5_1 = 0.707106781186548*x4_0 - 0.707106781186548*x4_2; > + const float x5_2 = 0.5*x4_0 - 0.707106781186548*x4_1 + 0.5*x4_2; > + const float x5_3 = 0.707106781186547*x4_3 + 0.707106781186547*x4_4; > + const float x5_4 = 0.707106781186547*x4_3 - 0.707106781186547*x4_4; > + const float x6_0 = 1.4142135623731*x2_5; > + const float x6_1 = 1.30656296487638*x2_6 + 0.541196100146197*x2_8; > + const float x6_2 = 1.4142135623731*x2_7; > + const float x6_3 = -0.541196100146197*x2_6 + 1.30656296487638*x2_8; > + const float x7_0 = 0.5*x6_0 + 0.707106781186548*x6_1 + 0.5*x6_2; > + const float x7_1 = 0.707106781186548*x6_0 - 0.707106781186548*x6_2; > + const float x7_2 = 0.5*x6_0 - 0.707106781186548*x6_1 + 0.5*x6_2; > + const float x3_6 = 0.707106781186547*x7_1 - 0.707106781186547*x6_3; > + const float x3_7 = 0.707106781186547*x7_1 + 0.707106781186547*x6_3; > + const float x8_0 = 1.4142135623731*x0_c; > + const float x8_1 = x0_b + x0_d; > + const float x8_2 = x0_a + x0_e; > + const float x8_3 = x0_9 + x0_f; > + const float x8_4 = x0_9 - x0_f; > + const float x8_5 = x0_a - x0_e; > + const float x8_6 = x0_b - x0_d; > + const float xa_0 = 1.4142135623731*x8_0; > + const float xa_1 = 1.30656296487638*x8_1 + 0.541196100146197*x8_3; > + const float xa_2 = 1.4142135623731*x8_2; > + const float xa_3 = -0.541196100146197*x8_1 + 1.30656296487638*x8_3; > + const float xb_0 = 0.5*xa_0 + 0.707106781186548*xa_1 + 0.5*xa_2; > + const float xb_1 = 0.707106781186548*xa_0 - 0.707106781186548*xa_2; > + const float xb_2 = 0.5*xa_0 - 0.707106781186548*xa_1 + 0.5*xa_2; > + const float x9_1 = 0.707106781186547*xb_1 - 0.707106781186547*xa_3; > + const float x9_2 = 0.707106781186547*xb_1 + 0.707106781186547*xa_3; > + const float xc_0 = 1.4142135623731*x8_5; > + const float xc_1 = x8_4 + x8_6; > + const float xc_2 = x8_4 - x8_6; > + const float xd_0 = 0.707106781186547*xc_0 + 0.707106781186547*xc_1; > + const float xd_1 = 0.707106781186547*xc_0 - 0.707106781186547*xc_1; > + const float x9_6 = -xd_1; > + const float x1_b = -x9_1; > + const float x1_f = -xb_2; > + dst[ 0*dst_stridea] = (add ? dst[ 0*dst_stridea] : 0) + 0.353553390593274*x5_0; > + dst[ 1*dst_stridea] = (add ? dst[ 1*dst_stridea] : 0) + 0.25*x7_0 - 0.25*x1_f; > + dst[ 2*dst_stridea] = (add ? dst[ 2*dst_stridea] : 0) + 0.25*x7_0 + 0.25*x1_f; > + dst[ 3*dst_stridea] = (add ? dst[ 3*dst_stridea] : 0) + 0.25*x5_3 + 0.25*x9_6; > + dst[ 4*dst_stridea] = (add ? dst[ 4*dst_stridea] : 0) + 0.25*x5_3 - 0.25*x9_6; > + dst[ 5*dst_stridea] = (add ? dst[ 5*dst_stridea] : 0) + 0.25*x3_6 - 0.25*x9_2; > + dst[ 6*dst_stridea] = (add ? dst[ 6*dst_stridea] : 0) + 0.25*x3_6 + 0.25*x9_2; > + dst[ 7*dst_stridea] = (add ? dst[ 7*dst_stridea] : 0) + 0.25*x5_1 + 0.25*xc_2; > + dst[ 8*dst_stridea] = (add ? dst[ 8*dst_stridea] : 0) + 0.25*x5_1 - 0.25*xc_2; > + dst[ 9*dst_stridea] = (add ? dst[ 9*dst_stridea] : 0) + 0.25*x3_7 - 0.25*x1_b; > + dst[10*dst_stridea] = (add ? dst[10*dst_stridea] : 0) + 0.25*x3_7 + 0.25*x1_b; > + dst[11*dst_stridea] = (add ? dst[11*dst_stridea] : 0) + 0.25*x5_4 + 0.25*xd_0; > + dst[12*dst_stridea] = (add ? dst[12*dst_stridea] : 0) + 0.25*x5_4 - 0.25*xd_0; > + dst[13*dst_stridea] = (add ? dst[13*dst_stridea] : 0) + 0.25*x7_2 - 0.25*xb_0; > + dst[14*dst_stridea] = (add ? dst[14*dst_stridea] : 0) + 0.25*x7_2 + 0.25*xb_0; > + dst[15*dst_stridea] = (add ? dst[15*dst_stridea] : 0) + 0.353553390593274*x5_2; > + dst += dst_strideb; > + src += src_strideb; > } > +} > > - for (y = 0; y < BSIZE; y++) > - for (x = 0; x < BSIZE; x++) > - ctx->block[y*BSIZE + x] = ctx->tmp_block[x*BSIZE + y]; > +static av_always_inline void filter_freq(const float *src, int src_linesize, > + float *dst, int dst_linesize, > + AVExpr *expr, double *var_values, > + int sigma_th) > +{ > + unsigned i; > + DECLARE_ALIGNED(32, float, tmp_block1)[BSIZE * BSIZE]; > + DECLARE_ALIGNED(32, float, tmp_block2)[BSIZE * BSIZE]; > + > + /* forward DCT */ > + fdct16_1d(tmp_block1, src, 1, BSIZE, 1, src_linesize); > + fdct16_1d(tmp_block2, tmp_block1, BSIZE, 1, BSIZE, 1); > + > + for (i = 0; i < BSIZE*BSIZE; i++) { > + float *b = &tmp_block2[i]; > + /* frequency filtering */ > + if (expr) { > + var_values[VAR_C] = FFABS(*b); > + *b *= av_expr_eval(expr, var_values, NULL); > + } else { > + if (FFABS(*b) < sigma_th) > + *b = 0; > + } > + } > > - return ctx->block; > + /* inverse DCT */ > + idct16_1d(tmp_block1, tmp_block2, 1, BSIZE, 1, BSIZE, 0); > + idct16_1d(dst, tmp_block1, dst_linesize, 1, BSIZE, 1, 1); > } > > -static void idct_block(DCTdnoizContext *ctx, float *dst, int dst_linesize) > +static void filter_freq_sigma(DCTdnoizContext *s, > + const float *src, int src_linesize, > + float *dst, int dst_linesize) > { > - int x, y; > - float *block = ctx->block; > - float *tmp = ctx->tmp_block; > - > - for (y = 0; y < BSIZE; y++) { > - block[0] *= sqrt(BSIZE); > - for (x = 1; x < BSIZE; x++) > - block[x] *= 1./sqrt(2. / BSIZE); > - av_dct_calc(ctx->idct, block); > - block += BSIZE; > - } > + filter_freq(src, src_linesize, dst, dst_linesize, NULL, NULL, s->th); > +} > > - block = ctx->block; > - for (y = 0; y < BSIZE; y++) { > - tmp[0] = block[y] * sqrt(BSIZE); > - for (x = 1; x < BSIZE; x++) > - tmp[x] = block[x*BSIZE + y] * (1./sqrt(2. / BSIZE)); > - av_dct_calc(ctx->idct, tmp); > - for (x = 0; x < BSIZE; x++) > - dst[x*dst_linesize + y] += tmp[x]; > - } > +static void filter_freq_expr(DCTdnoizContext *s, > + const float *src, int src_linesize, > + float *dst, int dst_linesize) > +{ > + filter_freq(src, src_linesize, dst, dst_linesize, s->expr, s->var_values, 0); > } > > static int config_input(AVFilterLink *inlink) > @@ -194,18 +373,13 @@ static av_cold int init(AVFilterContext *ctx) > NULL, NULL, NULL, NULL, 0, ctx); > if (ret < 0) > return ret; > + s->filter_freq_func = filter_freq_expr; > + } else { > + s->filter_freq_func = filter_freq_sigma; > } Missing "if (s->expr)"? > > s->th = s->sigma * 3.; > s->step = BSIZE - s->overlap; > - s->dct = av_dct_init(NBITS, DCT_II); > - s->idct = av_dct_init(NBITS, DCT_III); > - s->block = av_malloc(BSIZE * BSIZE * sizeof(*s->block)); > - s->tmp_block = av_malloc(BSIZE * BSIZE * sizeof(*s->tmp_block)); > - > - if (!s->dct || !s->idct || !s->tmp_block || !s->block) > - return AVERROR(ENOMEM); > - > return 0; > } > > @@ -272,7 +446,7 @@ static void filter_plane(AVFilterContext *ctx, > const float *src, int src_linesize, > int w, int h) > { > - int x, y, bx, by; > + int x, y; > DCTdnoizContext *s = ctx->priv; > float *dst0 = dst; > const float *weights = s->weights; > @@ -282,27 +456,9 @@ static void filter_plane(AVFilterContext *ctx, > > // block dct sums > for (y = 0; y < h - BSIZE + 1; y += s->step) { > - for (x = 0; x < w - BSIZE + 1; x += s->step) { > - float *ftb = dct_block(s, src + x, src_linesize); > - > - if (s->expr) { > - for (by = 0; by < BSIZE; by++) { > - for (bx = 0; bx < BSIZE; bx++) { > - s->var_values[VAR_C] = FFABS(*ftb); > - *ftb++ *= av_expr_eval(s->expr, s->var_values, s); > - } > - } > - } else { > - for (by = 0; by < BSIZE; by++) { > - for (bx = 0; bx < BSIZE; bx++) { > - if (FFABS(*ftb) < s->th) > - *ftb = 0; > - ftb++; > - } > - } > - } > - idct_block(s, dst + x, dst_linesize); > - } > + for (x = 0; x < w - BSIZE + 1; x += s->step) > + s->filter_freq_func(s, src + x, src_linesize, > + dst + x, dst_linesize); > src += s->step * src_linesize; > dst += s->step * dst_linesize; > } > @@ -388,10 +544,6 @@ static av_cold void uninit(AVFilterContext *ctx) > int i; > DCTdnoizContext *s = ctx->priv; > > - av_dct_end(s->dct); > - av_dct_end(s->idct); > - av_free(s->block); > - av_free(s->tmp_block); > av_free(s->weights); > for (i = 0; i < 2; i++) { > av_free(s->cbuf[i][0]); > -- > 2.0.2 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel