use intrinsic

cqt_time:
plain = 3.286 s
SSE   = 1.725 s
SSE3  = 1.692 s
AVX   = 1.399 s
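
The cqt_calc callbacks vectorize roughly the following per-bin scalar
computation (an illustrative sketch of the existing plain cqt_calc, not
part of this patch). The input FFT carries the left channel in the real
part and the right channel in the imaginary part, so bin i and its
mirror j = fft_len - i are combined to separate the channels:

    for (k = 0; k < len; k++) {
        float a_re = 0.0f, a_im = 0.0f, b_re = 0.0f, b_im = 0.0f;
        float l_re, l_im, r_re, r_im;
        int x, i, j;

        for (x = 0; x < coeffs[k].len; x++) {
            float u = coeffs[k].val[x];
            i = coeffs[k].start + x;
            j = fft_len - i;
            a_re += u * src[i].re;   /* weighted bin i      */
            a_im += u * src[i].im;
            b_re += u * src[j].re;   /* weighted mirror bin */
            b_im += u * src[j].im;
        }

        /* separate left and right (scaled by 2.0) */
        l_re = a_re + b_re;
        l_im = a_im - b_im;
        r_re = b_im + a_im;
        r_im = b_re - a_re;

        /* store the squared magnitude of each channel */
        dst[k].re = l_re * l_re + l_im * l_im;
        dst[k].im = r_re * r_re + r_im * r_im;
    }
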
Signed-off-by: Muhammad Faiz <mfc...@gmail.com>
---
 libavfilter/avf_showcqt.c     |   7 +
 libavfilter/avf_showcqt.h     |   4 +
 libavfilter/x86/Makefile      |   1 +
 libavfilter/x86/avf_showcqt.c | 289 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 301 insertions(+)
 create mode 100644 libavfilter/x86/avf_showcqt.c

diff --git a/libavfilter/avf_showcqt.c b/libavfilter/avf_showcqt.c
index 8928bfb..2d2644c 100644
--- a/libavfilter/avf_showcqt.c
+++ b/libavfilter/avf_showcqt.c
@@ -320,6 +320,9 @@ static int init_cqt(ShowCQTContext *s)
             w *= sign * (1.0 / s->fft_len);
             s->coeffs[m].val[x - s->coeffs[m].start] = w;
         }
+
+        if (s->permute_coeffs)
+            s->permute_coeffs(s->coeffs[m].val, s->coeffs[m].len);
     }
 
     av_expr_free(expr);
@@ -1189,6 +1192,10 @@ static int config_output(AVFilterLink *outlink)
         s->update_sono = update_sono_yuv;
     }
 
+    /* arch specific initialization */
+    if (ARCH_X86)
+        ff_showcqt_init_x86(s);
+
     if ((ret = init_cqt(s)) < 0)
         return ret;
 
diff --git a/libavfilter/avf_showcqt.h b/libavfilter/avf_showcqt.h
index b945f49..d01d90a 100644
--- a/libavfilter/avf_showcqt.h
+++ b/libavfilter/avf_showcqt.h
@@ -79,6 +79,8 @@ typedef struct {
     void (*draw_axis)(AVFrame *out, AVFrame *axis, const ColorFloat *c, int off);
     void (*draw_sono)(AVFrame *out, AVFrame *sono, int off, int idx);
     void (*update_sono)(AVFrame *sono, const ColorFloat *c, int idx);
+    /* permute callback, for easier SIMD code */
+    void (*permute_coeffs)(float *val, int len);
     /* performance debugging */
     int64_t fft_time;
     int64_t cqt_time;
@@ -112,4 +114,6 @@ typedef struct {
     int axis;
 } ShowCQTContext;
 
+void ff_showcqt_init_x86(ShowCQTContext *s);
+
 #endif
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 33de380..9633a7f 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_PP7_FILTER)                    += x86/vf_pp7_init.o
 OBJS-$(CONFIG_PSNR_FILTER)                   += x86/vf_psnr_init.o
 OBJS-$(CONFIG_PULLUP_FILTER)                 += x86/vf_pullup_init.o
 OBJS-$(CONFIG_REMOVEGRAIN_FILTER)            += x86/vf_removegrain_init.o
+OBJS-$(CONFIG_SHOWCQT_FILTER)                += x86/avf_showcqt.o
 OBJS-$(CONFIG_SPP_FILTER)                    += x86/vf_spp.o
 OBJS-$(CONFIG_SSIM_FILTER)                   += x86/vf_ssim_init.o
 OBJS-$(CONFIG_STEREO3D_FILTER)               += x86/vf_stereo3d_init.o
diff --git a/libavfilter/x86/avf_showcqt.c b/libavfilter/x86/avf_showcqt.c
new file mode 100644
index 0000000..b8e9d32
--- /dev/null
+++ b/libavfilter/x86/avf_showcqt.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2016 Muhammad Faiz <mfc...@gmail.com>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/x86/intrinsic.h"
+#include "libavfilter/avf_showcqt.h"
+
+#if HAVE_SSE_INTRINSIC
+#include <xmmintrin.h>
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+#include <pmmintrin.h>
+#endif
+
+#if HAVE_AVX_INTRINSIC
+#include <immintrin.h>
+#endif
+
+#define CALCULATE(z) \
+do { \
+    u = _mm_load_ps(coeffs[k+z].val + x); \
+    i = coeffs[k+z].start + x; \
+    j = fft_len - i; \
+    m = _mm_load_ps(&src[i].re); \
+    n = _mm_load_ps(&src[i+2].re); \
+    m_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+    m_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+    m = _mm_loadu_ps(&src[j-1].re); \
+    n = _mm_loadu_ps(&src[j-3].re); \
+    n_re = _mm_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+    n_im = _mm_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+    a_re[z] = _mm_add_ps(a_re[z], _mm_mul_ps(u, m_re)); \
+    a_im[z] = _mm_add_ps(a_im[z], _mm_mul_ps(u, m_im)); \
+    b_re[z] = _mm_add_ps(b_re[z], _mm_mul_ps(u, n_re)); \
+    b_im[z] = _mm_add_ps(b_im[z], _mm_mul_ps(u, n_im)); \
+} while (0)
+
+#if HAVE_SSE_INTRINSIC
+static av_intrinsic_sse
+void cqt_calc_sse(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+                  int len, int fft_len)
+{
+    int k, x, i, j, coeffs_len;
+    __m128 result[2];
+    __m128 l_re[2], l_im[2];
+    __m128 r_re[2], r_im[2];
+    __m128 a_re[2], a_im[2];
+    __m128 b_re[2], b_im[2];
+    __m128 m, n;
+    __m128 m_re, m_im;
+    __m128 n_re, n_im;
+    __m128 u;
+
+    for (k = 0; k < len; k += 2) {
+        a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+        b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+        coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+        for (x = 0; x < coeffs_len; x += 4) {
+            CALCULATE(0);
+            CALCULATE(1);
+        }
+
+        coeffs_len = coeffs[k].len;
+        for ( ; x < coeffs_len; x += 4)
+            CALCULATE(0);
+
+        coeffs_len = coeffs[k+1].len;
+        for ( ; x < coeffs_len; x += 4)
+            CALCULATE(1);
+
+        /* separate left and right, (and multiply by 2.0) */
+#define SEPARATE(z) \
+do { \
+    l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+    l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+    r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+    r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+    m = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(2,0,2,0)); \
+    n = _mm_shuffle_ps(l_re[z], l_im[z], _MM_SHUFFLE(3,1,3,1)); \
+    l_re[z] = _mm_add_ps(m, n); \
+    m = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(2,0,2,0)); \
+    n = _mm_shuffle_ps(r_re[z], r_im[z], _MM_SHUFFLE(3,1,3,1)); \
+    r_re[z] = _mm_add_ps(m, n); \
+    m = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(2,0,2,0)); \
+    n = _mm_shuffle_ps(l_re[z], r_re[z], _MM_SHUFFLE(3,1,3,1)); \
+    l_re[z] = _mm_add_ps(m, n); \
+    result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+        SEPARATE(0);
+        SEPARATE(1);
+#undef SEPARATE
+        m = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(2,0,2,0));
+        n = _mm_shuffle_ps(result[0], result[1], _MM_SHUFFLE(3,1,3,1));
+        _mm_store_ps(&dst[k].re, _mm_add_ps(m, n));
+    }
+}
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+static av_intrinsic_sse3
+void cqt_calc_sse3(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+                   int len, int fft_len)
+{
+    int k, x, i, j, coeffs_len;
+    __m128 result[2];
+    __m128 l_re[2], l_im[2];
+    __m128 r_re[2], r_im[2];
+    __m128 a_re[2], a_im[2];
+    __m128 b_re[2], b_im[2];
+    __m128 m, n;
+    __m128 m_re, m_im;
+    __m128 n_re, n_im;
+    __m128 u;
+
+    for (k = 0; k < len; k += 2) {
+        a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm_setzero_ps();
+        b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm_setzero_ps();
+
+        coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+        for (x = 0; x < coeffs_len; x += 4) {
+            CALCULATE(0);
+            CALCULATE(1);
+        }
+
+        coeffs_len = coeffs[k].len;
+        for ( ; x < coeffs_len; x += 4)
+            CALCULATE(0);
+
+        coeffs_len = coeffs[k+1].len;
+        for ( ; x < coeffs_len; x += 4)
+            CALCULATE(1);
+
+        /* separate left and right, (and multiply by 2.0) */
+#define SEPARATE(z) \
+do { \
+    l_re[z] = _mm_add_ps(a_re[z], b_re[z]); \
+    l_im[z] = _mm_sub_ps(a_im[z], b_im[z]); \
+    r_re[z] = _mm_add_ps(b_im[z], a_im[z]); \
+    r_im[z] = _mm_sub_ps(b_re[z], a_re[z]); \
+    l_re[z] = _mm_hadd_ps(l_re[z], l_im[z]); \
+    r_re[z] = _mm_hadd_ps(r_re[z], r_im[z]); \
+    l_re[z] = _mm_hadd_ps(l_re[z], r_re[z]); \
+    result[z] = _mm_mul_ps(l_re[z], l_re[z]); \
+} while (0)
+        SEPARATE(0);
+        SEPARATE(1);
+#undef SEPARATE
+        _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+    }
+}
+#endif
+
+#undef CALCULATE
+
+#if HAVE_AVX_INTRINSIC
+static av_intrinsic_avx
+void cqt_calc_avx(FFTComplex *dst, const FFTComplex *src, const Coeffs *coeffs,
+                  int len, int fft_len)
+{
+    int k, x, i, j, coeffs_len;
+    __m128 result[2];
+    __m256 l_re[2], l_im[2];
+    __m256 r_re[2], r_im[2];
+    __m256 a_re[2], a_im[2];
+    __m256 b_re[2], b_im[2];
+    __m256 m, n;
+    __m256 m_re, m_im;
+    __m256 n_re, n_im;
+    __m256 u;
+
+#define CALCULATE(z) \
+do { \
+    u = _mm256_load_ps(coeffs[k+z].val + x); \
+    i = coeffs[k+z].start + x; \
+    j = fft_len - i; \
+    m = _mm256_load_ps(&src[i].re); \
+    n = _mm256_load_ps(&src[i+4].re); \
+    m_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(2,0,2,0)); \
+    m_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(3,1,3,1)); \
+    m = _mm256_loadu_ps(&src[j-3].re); \
+    m = _mm256_permute2f128_ps(m, m, _MM_SHUFFLE2(0, 1)); \
+    n = _mm256_loadu_ps(&src[j-7].re); \
+    n = _mm256_permute2f128_ps(n, n, _MM_SHUFFLE2(0, 1)); \
+    n_re = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(0,2,0,2)); \
+    n_im = _mm256_shuffle_ps(m, n, _MM_SHUFFLE(1,3,1,3)); \
+    a_re[z] = _mm256_add_ps(a_re[z], _mm256_mul_ps(u, m_re)); \
+    a_im[z] = _mm256_add_ps(a_im[z], _mm256_mul_ps(u, m_im)); \
+    b_re[z] = _mm256_add_ps(b_re[z], _mm256_mul_ps(u, n_re)); \
+    b_im[z] = _mm256_add_ps(b_im[z], _mm256_mul_ps(u, n_im)); \
+} while (0)
+
+    for (k = 0; k < len; k += 2) {
+        a_re[0] = a_re[1] = a_im[0] = a_im[1] = _mm256_setzero_ps();
+        b_re[0] = b_re[1] = b_im[0] = b_im[1] = _mm256_setzero_ps();
+
+        coeffs_len = FFMIN(coeffs[k].len, coeffs[k+1].len);
+        for (x = 0; x < coeffs_len; x += 8) {
+            CALCULATE(0);
+            CALCULATE(1);
+        }
+
+        coeffs_len = coeffs[k].len;
+        for ( ; x < coeffs_len; x += 8)
+            CALCULATE(0);
+
+        coeffs_len = coeffs[k+1].len;
+        for ( ; x < coeffs_len; x += 8)
+            CALCULATE(1);
+
+        /* separate left and right, (and multiply by 2.0) */
+#define SEPARATE(z) \
+do { \
+    l_re[z] = _mm256_add_ps(a_re[z], b_re[z]); \
+    l_im[z] = _mm256_sub_ps(a_im[z], b_im[z]); \
+    r_re[z] = _mm256_add_ps(b_im[z], a_im[z]); \
+    r_im[z] = _mm256_sub_ps(b_re[z], a_re[z]); \
+    l_re[z] = _mm256_hadd_ps(l_re[z], l_im[z]); \
+    r_re[z] = _mm256_hadd_ps(r_re[z], r_im[z]); \
+    l_re[z] = _mm256_hadd_ps(l_re[z], r_re[z]); \
+    result[z] = _mm_add_ps(_mm256_castps256_ps128(l_re[z]), \
+                           _mm256_castps256_ps128(_mm256_permute2f128_ps(l_re[z], l_re[z], _MM_SHUFFLE2(0, 1)))); \
+    result[z] = _mm_mul_ps(result[z], result[z]); \
+} while (0)
+        SEPARATE(0);
+        SEPARATE(1);
+        _mm_store_ps(&dst[k].re, _mm_hadd_ps(result[0], result[1]));
+    }
+#undef CALCULATE
+#undef SEPARATE
+}
+
+static void permute_coeffs_avx(float *v, int len)
+{
+    int k;
+    for (k = 0; k < len; k += 8) {
+        FFSWAP(float, v[k+2], v[k+4]);
+        FFSWAP(float, v[k+3], v[k+5]);
+    }
+}
+#endif
+
+av_cold void ff_showcqt_init_x86(ShowCQTContext *s)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+#if HAVE_SSE_INTRINSIC
+    if (cpu_flags & AV_CPU_FLAG_SSE) {
+        s->cqt_calc = cqt_calc_sse;
+        s->permute_coeffs = NULL;
+        s->cqt_align = 4;
+    }
+#endif
+
+#if HAVE_SSE3_INTRINSIC
+    if (cpu_flags & AV_CPU_FLAG_SSE3 && !(cpu_flags & AV_CPU_FLAG_SSE3SLOW)) {
+        s->cqt_calc = cqt_calc_sse3;
+        s->permute_coeffs = NULL;
+        s->cqt_align = 4;
+    }
+#endif
+
+#if HAVE_AVX_INTRINSIC
+    if (cpu_flags & AV_CPU_FLAG_AVX && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+        s->cqt_calc = cqt_calc_avx;
+        s->permute_coeffs = permute_coeffs_avx;
+        s->cqt_align = 8;
+    }
+#endif
+}
-- 
2.5.0
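
Note on permute_coeffs_avx: 256-bit shuffles such as _mm256_shuffle_ps
operate within each 128-bit lane, so the AVX CALCULATE gathers the real
(and imaginary) parts of eight consecutive FFT bins in the order
{i, i+1, i+4, i+5, i+2, i+3, i+6, i+7} rather than sequentially.
Permuting each group of eight coefficients the same way once at init
(swapping elements 2,3 with 4,5) keeps every coefficient paired with its
bin in the element-wise multiply; since the products are reduced by a
horizontal sum afterwards, the order within a group is otherwise
irrelevant. Illustrative pairing after the permutation:

    /* u    = { c0,    c1,      c4,      c5,      c2,      c3,      c6,      c7      } */
    /* m_re = { re[i], re[i+1], re[i+4], re[i+5], re[i+2], re[i+3], re[i+6], re[i+7] } */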