On Mon, Aug 04, 2014 at 01:14:37AM +0200, Michael Niedermayer wrote: > On Mon, Aug 04, 2014 at 12:44:48AM +0200, Michael Niedermayer wrote: > > On Sun, Aug 03, 2014 at 10:27:21PM +0200, Clément Bœsch wrote: > > > This removes the avcodec dependency and make the code almost twice as > > > fast. More to come. > > > > > > The DCT factorization is based on "Fast and numerically stable > > > algorithms for discrete cosine transforms" from Gerlind Plonkaa & > > > Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015). > > > --- > > > configure | 2 - > > > libavfilter/vf_dctdnoiz.c | 328 > > > +++++++++++++++++++++++++++++++++------------- > > > 2 files changed, 240 insertions(+), 90 deletions(-) > > > > > > diff --git a/configure b/configure > > > index 9c3af50..6196b2a 100755 > > > --- a/configure > > > +++ b/configure > > > @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl" > > > bs2b_filter_deps="libbs2b" > > > colormatrix_filter_deps="gpl" > > > cropdetect_filter_deps="gpl" > > > -dctdnoiz_filter_deps="avcodec" > > > -dctdnoiz_filter_select="dct" > > > delogo_filter_deps="gpl" > > > deshake_filter_deps="avcodec" > > > deshake_filter_select="me_cmp" > > > diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c > > > index 71b8536..6d24934 100644 > > > --- a/libavfilter/vf_dctdnoiz.c > > > +++ b/libavfilter/vf_dctdnoiz.c > > > @@ -1,5 +1,5 @@ > > > /* > > > - * Copyright (c) 2013 Clément Bœsch > > > + * Copyright (c) 2013-2014 Clément Bœsch > > > * > > > * This file is part of FFmpeg. > > > * > > > @@ -23,7 +23,6 @@ > > > * @see http://www.ipol.im/pub/art/2011/ys-dct/ > > > */ > > > > > > -#include "libavcodec/avfft.h" > > > #include "libavutil/eval.h" > > > #include "libavutil/opt.h" > > > #include "drawutils.h" > > > @@ -35,7 +34,7 @@ > > > static const char *const var_names[] = { "c", NULL }; > > > enum { VAR_C, VAR_VARS_NB }; > > > > > > -typedef struct { > > > +typedef struct DCTdnoizContext { > > > const AVClass *class; > > > > > > /* coefficient factor expression */ > > > @@ -52,8 +51,9 @@ typedef struct { > > > int p_linesize; // line sizes for color and weights > > > int overlap; // number of block overlapping pixels > > > int step; // block step increment (BSIZE - overlap) > > > - DCTContext *dct, *idct; // DCT and inverse DCT contexts > > > - float *block, *tmp_block; // two BSIZE x BSIZE block buffers > > > + void (*filter_freq_func)(struct DCTdnoizContext *s, > > > + const float *src, int src_linesize, > > > + float *dst, int dst_linesize); > > > } DCTdnoizContext; > > > > > > #define OFFSET(x) offsetof(DCTdnoizContext, x) > > > @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = { > > > > > > AVFILTER_DEFINE_CLASS(dctdnoiz); > > > > > > -static float *dct_block(DCTdnoizContext *ctx, const float *src, int > > > src_linesize) > > > +static void av_always_inline fdct16_1d(float *dst, const float *src, > > > + int dst_stridea, int dst_strideb, > > > + int src_stridea, int src_strideb) > > > { > > > - int x, y; > > > - float *column; > > > - > > > - for (y = 0; y < BSIZE; y++) { > > > - float *line = ctx->block; > > > + int i; > > > > > > - memcpy(line, src, BSIZE * sizeof(*line)); > > > - src += src_linesize; > > > - av_dct_calc(ctx->dct, line); > > > - > > > - column = ctx->tmp_block + y; > > > - column[0] = line[0] * (1. / sqrt(BSIZE)); > > > - column += BSIZE; > > > - for (x = 1; x < BSIZE; x++) { > > > - *column = line[x] * sqrt(2. / BSIZE); > > > - column += BSIZE; > > > - } > > > + for (i = 0; i < BSIZE; i++) { > > > + const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea]; > > > + const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea]; > > > + const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea]; > > > + const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea]; > > > + const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea]; > > > + const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea]; > > > + const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea]; > > > + const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea]; > > > + const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea]; > > > + const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea]; > > > + const float x0_a = src[ 2*src_stridea] - src[13*src_stridea]; > > > + const float x0_b = src[ 3*src_stridea] - src[12*src_stridea]; > > > + const float x0_c = src[ 4*src_stridea] - src[11*src_stridea]; > > > + const float x0_d = src[ 5*src_stridea] - src[10*src_stridea]; > > > + const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea]; > > > + const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea]; > > > + const float x2_0 = x0_0 + x0_7; > > > + const float x2_1 = x0_1 + x0_6; > > > + const float x2_2 = x0_2 + x0_5; > > > + const float x2_3 = x0_3 + x0_4; > > > + const float x2_4 = x0_0 - x0_7; > > > + const float x2_5 = x0_1 - x0_6; > > > + const float x2_6 = x0_2 - x0_5; > > > + const float x2_7 = x0_3 - x0_4; > > > + const float x4_0 = x2_0 + x2_3; > > > + const float x4_1 = x2_1 + x2_2; > > > + const float x4_2 = x2_0 - x2_3; > > > + const float x4_3 = x2_1 - x2_2; > > > + const float x5_0 = x4_0 + x4_1; > > > + const float x5_1 = x4_0 - x4_1; > > > + const float x5_2 = 1.306562964876380*x4_2 + > > > 0.541196100146197*x4_3; > > > + const float x5_3 = 0.541196100146197*x4_2 - > > > 1.306562964876380*x4_3; > > > + const float x6_0 = 1.387039845322150*x2_4 + > > > 0.275899379282943*x2_7; > > > + const float x6_1 = 1.175875602419360*x2_5 + > > > 0.785694958387102*x2_6; > > > + const float x6_2 = -0.785694958387102*x2_5 + > > > 1.175875602419360*x2_6; > > > + const float x6_3 = 0.275899379282943*x2_4 - > > > 1.387039845322150*x2_7; > > > + const float x7_0 = x6_0 + x6_1; > > > + const float x7_1 = x6_0 - x6_1; > > > + const float x7_2 = x6_2 + x6_3; > > > + const float x7_3 = x6_2 - x6_3; > > > + const float x3_5 = 0.707106781186547*x7_1 - > > > 0.707106781186547*x7_3; > > > + const float x3_6 = 0.707106781186547*x7_1 + > > > 0.707106781186547*x7_3; > > > + const float x8_0 = 1.407403737526380*x0_8 + > > > 0.138617169199091*x0_f; > > > + const float x8_1 = 1.353318001174350*x0_9 + > > > 0.410524527522357*x0_e; > > > + const float x8_2 = 1.247225012986670*x0_a + > > > 0.666655658477747*x0_d; > > > + const float x8_3 = 1.093201867001760*x0_b + > > > 0.897167586342636*x0_c; > > > + const float x8_4 = -0.897167586342636*x0_b + > > > 1.093201867001760*x0_c; > > > + const float x8_5 = 0.666655658477747*x0_a - > > > 1.247225012986670*x0_d; > > > + const float x8_6 = -0.410524527522357*x0_9 + > > > 1.353318001174350*x0_e; > > > + const float x8_7 = 0.138617169199091*x0_8 - > > > 1.407403737526380*x0_f; > > > + const float xa_0 = x8_0 + x8_3; > > > + const float xa_1 = x8_1 + x8_2; > > > + const float xa_2 = x8_0 - x8_3; > > > + const float xa_3 = x8_1 - x8_2; > > > + const float xb_0 = xa_0 + xa_1; > > > + const float xb_1 = xa_0 - xa_1; > > > + const float xb_2 = 1.306562964876380*xa_2 + > > > 0.541196100146197*xa_3; > > > + const float xb_3 = 0.541196100146197*xa_2 - > > > 1.306562964876380*xa_3; > > > + const float xc_0 = x8_4 + x8_7; > > > + const float xc_1 = x8_5 + x8_6; > > > + const float xc_2 = x8_4 - x8_7; > > > + const float xc_3 = x8_5 - x8_6; > > > + const float xd_0 = xc_0 + xc_1; > > > + const float xd_1 = xc_0 - xc_1; > > > + const float xd_2 = 1.306562964876380*xc_2 + > > > 0.541196100146197*xc_3; > > > + const float xd_3 = 0.541196100146197*xc_2 - > > > 1.306562964876380*xc_3; > > > + const float x1_9 = 0.707106781186547*xb_2 - > > > 0.707106781186547*xd_3; > > > + const float x1_a = 0.707106781186547*xb_2 + > > > 0.707106781186547*xd_3; > > > + const float x1_b = 0.707106781186547*xb_1 + > > > 0.707106781186547*xd_1; > > > + const float x1_c = 0.707106781186547*xb_1 - > > > 0.707106781186547*xd_1; > > > + const float x1_d = 0.707106781186547*xb_3 - > > > 0.707106781186547*xd_2; > > > + const float x1_e = 0.707106781186547*xb_3 + > > > 0.707106781186547*xd_2; > > > + dst[ 0*dst_stridea] = 0.25*x5_0; > > > + dst[ 1*dst_stridea] = 0.25*xb_0; > > > + dst[ 2*dst_stridea] = 0.25*x7_0; > > > + dst[ 3*dst_stridea] = 0.25*x1_9; > > > + dst[ 4*dst_stridea] = 0.25*x5_2; > > > + dst[ 5*dst_stridea] = 0.25*x1_a; > > > + dst[ 6*dst_stridea] = 0.25*x3_5; > > > + dst[ 7*dst_stridea] = 0.25*x1_b; > > > + dst[ 8*dst_stridea] = 0.25*x5_1; > > > + dst[ 9*dst_stridea] = 0.25*x1_c; > > > + dst[10*dst_stridea] = 0.25*x3_6; > > > + dst[11*dst_stridea] = 0.25*x1_d; > > > + dst[12*dst_stridea] = 0.25*x5_3; > > > + dst[13*dst_stridea] = 0.25*x1_e; > > > + dst[14*dst_stridea] = 0.25*x7_2; > > > + dst[15*dst_stridea] = 0.25*xd_0; > > > > many of these multiplies look like they can be merged into other > > multiplies > > > > for example see: > > > > > > const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3; > > const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3; > > const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2; > > const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2; > > dst[11*dst_stridea] = 0.25*x1_d; > > dst[13*dst_stridea] = 0.25*x1_e; > > > > vs. > > > > const float xd_2 = (0.25*0.707106781186547*1.306562964876380)*xc_2 + > > (0.25*0.707106781186547*0.541196100146197)*xc_3; > > const float xb_3 = (0.25*0.707106781186547*0.541196100146197)*xa_2 - > > (0.25*0.707106781186547*1.306562964876380)*xa_3; > > dst[11*dst_stridea] = xb_3 - xd_2; > > dst[13*dst_stridea] = xb_3 + xd_2; > > also more generally > if you have 2 stages of butterflies each with 4 multiplies and 2 adds > in each butterfly > > a----\-/--\---/----------a' > X \ / > b----/-\----------\---/--b' > / \ \ / > c----\-/--/---\----------c' > X / \ > d----/-\----------/---\--d' > > > of additions > the first stage can scale their output arbitrarily for free by > changing the respective coefficients > the second stage can use any scaled input for free by adjusting their > coefficients similarly, this gives you 4 free parameters in the > example above which can be > choosen so as to make some coefficients trivial like +-1.0 > this also works accorss 2D (I)DCTs or with other things before or > after the (i)dct which can absorb such rescaling
also i suggest that the patch is applied before time is spend optimizing it further, also the moving around of multiplies can probably affect numerical stability if overdone [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB Good people do not need laws to tell them to act responsibly, while bad people will find a way around the laws. -- Plato
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel