From a403f93a8fa20ce0d7345d9a00d75bc90fe7d73f Mon Sep 17 00:00:00 2001 From: Muhammad Faiz <mfc...@gmail.com> Date: Tue, 13 Oct 2015 00:31:29 +0700 Subject: [PATCH] avcodec/fft: out of place permutation with av_fft_permute2
With optimization (more cache friendly). Also optimize av_fft_permute. Machine-specific code is (unfortunately) not touched. Speedup (at least on my machine, bits = 14): with av_fft_permute ~ 1.5x - 2x, with av_fft_permute2 ~ 2.5x - 3x. --- libavcodec/avfft.c | 5 +++++ libavcodec/avfft.h | 3 +++ libavcodec/fft.h | 2 ++ libavcodec/fft_template.c | 34 +++++++++++++++++++++++++++++----- libavcodec/version.h | 2 +- 5 files changed, 40 insertions(+), 6 deletions(-) diff --git a/libavcodec/avfft.c b/libavcodec/avfft.c index 675d2b9..6b33ab5 100644 --- a/libavcodec/avfft.c +++ b/libavcodec/avfft.c @@ -40,6 +40,11 @@ void av_fft_permute(FFTContext *s, FFTComplex *z) s->fft_permute(s, z); } +void av_fft_permute2(FFTContext *s, FFTComplex *dst, const FFTComplex *src) +{ + s->fft_permute2(s, dst, src); +} + void av_fft_calc(FFTContext *s, FFTComplex *z) { s->fft_calc(s, z); diff --git a/libavcodec/avfft.h b/libavcodec/avfft.h index 0c0f9b8..31d5420 100644 --- a/libavcodec/avfft.h +++ b/libavcodec/avfft.h @@ -52,6 +52,9 @@ FFTContext *av_fft_init(int nbits, int inverse); */ void av_fft_permute(FFTContext *s, FFTComplex *z); +/* out of place permutation */ +void av_fft_permute2(FFTContext *s, FFTComplex *dst, const FFTComplex *src); + /** * Do a complex FFT with the parameters defined in av_fft_init(). The * input data must be permuted before. No 1.0/sqrt(n) normalization is done. 
diff --git a/libavcodec/fft.h b/libavcodec/fft.h index 64f0f63..c7f2bdb 100644 --- a/libavcodec/fft.h +++ b/libavcodec/fft.h @@ -110,6 +110,8 @@ struct FFTContext { void (*mdct_calcw)(struct FFTContext *s, FFTDouble *output, const FFTSample *input); enum fft_permutation_type fft_permutation; enum mdct_permutation_type mdct_permutation; + /* out of place permutation */ + void (*fft_permute2)(struct FFTContext *s, FFTComplex *dst, const FFTComplex* src); }; #if CONFIG_HARDCODED_TABLES diff --git a/libavcodec/fft_template.c b/libavcodec/fft_template.c index 23ea453..00e652b 100644 --- a/libavcodec/fft_template.c +++ b/libavcodec/fft_template.c @@ -72,6 +72,8 @@ COSTABLE_CONST FFTSample * const FFT_NAME(ff_cos_tabs)[] = { #endif /* FFT_FIXED_32 */ static void fft_permute_c(FFTContext *s, FFTComplex *z); +static void fft_permute2_c(FFTContext *s, FFTComplex *dst, const FFTComplex *src); +static void fft_permute2_wrapper_c(FFTContext *s, FFTComplex *dst, const FFTComplex *src); static void fft_calc_c(FFTContext *s, FFTComplex *z); static int split_radix_permutation(int i, int n, int inverse) @@ -156,6 +158,7 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) s->fft_permutation = FF_FFT_PERM_DEFAULT; s->fft_permute = fft_permute_c; + s->fft_permute2= fft_permute2_c; s->fft_calc = fft_calc_c; #if CONFIG_MDCT s->imdct_calc = ff_imdct_calc_c; @@ -197,6 +200,9 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) } } + if (s->fft_permute != fft_permute_c && s->fft_permute2 == fft_permute2_c) + s->fft_permute2 = fft_permute2_wrapper_c; + return 0; fail: av_freep(&s->revtab); @@ -206,12 +212,30 @@ av_cold int ff_fft_init(FFTContext *s, int nbits, int inverse) static void fft_permute_c(FFTContext *s, FFTComplex *z) { - int j, np; + int n = 1 << s->nbits; + fft_permute2_c(s, s->tmp_buf, z); + memcpy(z, s->tmp_buf, n * sizeof(FFTComplex)); +} + +static void fft_permute2_c(FFTContext *s, FFTComplex *dst, const FFTComplex *src) +{ + int j, n, q; const 
uint16_t *revtab = s->revtab; - np = 1 << s->nbits; - /* TODO: handle split-radix permute in a more optimal way, probably in-place */ - for(j=0;j<np;j++) s->tmp_buf[revtab[j]] = z[j]; - memcpy(z, s->tmp_buf, np * sizeof(FFTComplex)); + n = 1 << s->nbits; + q = n >> 2; + for (j = 0; j < q; j++) { + dst[revtab[j ]] = src[j ]; + dst[revtab[j+ q]] = src[j+ q]; + dst[revtab[j+2*q]] = src[j+2*q]; + dst[revtab[j+3*q]] = src[j+3*q]; + } +} + +/* for fft_permute other than fft_permute_c */ +static void fft_permute2_wrapper_c(FFTContext *s, FFTComplex *dst, const FFTComplex *src) +{ + memcpy(dst, src, (1 << s->nbits) * sizeof(FFTComplex)); + s->fft_permute(s, dst); } av_cold void ff_fft_end(FFTContext *s) diff --git a/libavcodec/version.h b/libavcodec/version.h index c7fc1f1..953ff9f 100644 --- a/libavcodec/version.h +++ b/libavcodec/version.h @@ -29,7 +29,7 @@ #include "libavutil/version.h" #define LIBAVCODEC_VERSION_MAJOR 57 -#define LIBAVCODEC_VERSION_MINOR 5 +#define LIBAVCODEC_VERSION_MINOR 6 #define LIBAVCODEC_VERSION_MICRO 100 #define LIBAVCODEC_VERSION_INT AV_VERSION_INT(LIBAVCODEC_VERSION_MAJOR, \ -- 1.8.3.1
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel