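Hi, the SIMD implementations are much faster than the plain C code. Example timings (measured with: time ffmpeg -i ... -vf idet -f null /dev/null): Raw C: user 25.007s; MMX: user 16.818s; MMXEXT: user 16.191s; SSE2: user 15.481s; for comparison, with no idet filter at all: user 15.025s.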
YMMV. skal
commit de5064a0126a0a5bed9d8f151fa79b07614729fe Author: skal <pascal.massim...@gmail.com> Date: Tue Sep 2 16:55:29 2014 +0200 MMX/MMXEXT/SSE2 implementation of idet's filter_line() integration by Neil Birkbeck, with help from Vitor Sessak. core SSE2 loop by Skal (pascal.massim...@gmail.com) diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c index d441a5f..83f5463 100644 --- a/libavfilter/vf_idet.c +++ b/libavfilter/vf_idet.c @@ -23,37 +23,8 @@ #include "libavutil/cpu.h" #include "libavutil/common.h" #include "libavutil/opt.h" -#include "libavutil/pixdesc.h" -#include "avfilter.h" #include "internal.h" - -#define HIST_SIZE 4 - -typedef enum { - TFF, - BFF, - PROGRSSIVE, - UNDETERMINED, -} Type; - -typedef struct { - const AVClass *class; - float interlace_threshold; - float progressive_threshold; - - Type last_type; - int prestat[4]; - int poststat[4]; - - uint8_t history[HIST_SIZE]; - - AVFrame *cur; - AVFrame *next; - AVFrame *prev; - int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); - - const AVPixFmtDescriptor *csp; -} IDETContext; +#include "vf_idet.h" #define OFFSET(x) offsetof(IDETContext, x) #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM @@ -273,6 +244,9 @@ static av_cold int init(AVFilterContext *ctx) idet->filter_line = filter_line_c; + if (ARCH_X86) + ff_idet_init_x86(idet); + return 0; } diff --git a/libavfilter/vf_idet.h b/libavfilter/vf_idet.h new file mode 100644 index 0000000..f6f0d49 --- /dev/null +++ b/libavfilter/vf_idet.h @@ -0,0 +1,55 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_IDET_H +#define AVFILTER_IDET_H + +#include "libavutil/pixdesc.h" +#include "avfilter.h" + +#define HIST_SIZE 4 + +typedef enum { + TFF, + BFF, + PROGRSSIVE, + UNDETERMINED, +} Type; + +typedef struct { + const AVClass *class; + float interlace_threshold; + float progressive_threshold; + + Type last_type; + int prestat[4]; + int poststat[4]; + + uint8_t history[HIST_SIZE]; + + AVFrame *cur; + AVFrame *next; + AVFrame *prev; + int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); + + const AVPixFmtDescriptor *csp; +} IDETContext; + +void ff_idet_init_x86(IDETContext *idet); + +#endif diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 6a252b4..ddb3774 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,5 +1,6 @@ OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o +OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o @@ -7,6 +8,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o +YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o 
x86/yadif-10.o diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm new file mode 100644 index 0000000..60c3382 --- /dev/null +++ b/libavfilter/x86/vf_idet.asm @@ -0,0 +1,193 @@ +;; ***************************************************************************** +;; * x86-optimized functions for idet filter +;; * +;; * This file is part of FFmpeg. +;; * +;; * FFmpeg is free software; you can redistribute it and/or modify +;; * it under the terms of the GNU General Public License as published by +;; * the Free Software Foundation; either version 2 of the License, or +;; * (at your option) any later version. +;; * +;; * FFmpeg is distributed in the hope that it will be useful, +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; * GNU General Public License for more details. +;; * +;; * You should have received a copy of the GNU General Public License along +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc., +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;; ****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +%macro DECLARE_VAR 2 + %define %1 %2 + %define %1d %2d +%endmacro + +;; Mappings for common variables +DECLARE_VAR index, r5 +DECLARE_VAR total, r4 +DECLARE_VAR parallel_width, r6 +DECLARE_VAR tmp1, r7 +DECLARE_VAR tmp2, r6 + +;; Non-simd loop over the range of the current value of index to width. 
+%macro LEFT_OVER_LOOP 0 +.left_over_loop: + movzx tmp2d, byte[aq+index] + movzx tmp1d, byte[cq+index] + add tmp1d, tmp2d + movzx tmp2d, byte[bq+index] + add index, 0x1 + neg tmp2d + lea tmp1d, [tmp1+tmp2*2] ;; tmp1 = a + c - 2 * b + mov tmp2d, tmp1d + sar tmp2d, 0x1f + xor tmp1d, tmp2d + sub tmp1d, tmp2d + movsxd tmp1, tmp1d + add total, tmp1 ;; total += abs(a + c - 2 * b) + CMP indexd, widthd + jne .left_over_loop +%endmacro + +;; Implementation that does 8-bytes at a time using single-word operations. +;; Can be instantiated with different implementations of ABS1 +%macro IDET_FILTER_LINE 0 +cglobal idet_filter_line, 4, 8, 8, a, b, c, width + mov parallel_widthd, widthd + xor index, index + and parallel_width, 0xfffffff8 + jle .handle_last_few_pixels + %define m_zero m7 + %define m_sum m6 + pxor m_sum, m_sum + pxor m_zero, m_zero + +.loop: + movq m0, [aq+index*1] + movq m1, m0 + punpcklbw m0, m_zero + punpckhbw m1, m_zero + + movq m3, [cq+index*1] + movq m4, m3 + punpcklbw m3, m_zero + punpckhbw m4, m_zero + + paddsw m0, m3 + paddsw m1, m4 + + movq m3, [bq+index*1] + movq m4, m3 + punpcklbw m3, m_zero + punpckhbw m4, m_zero + + psllw m3, 0x1 + psllw m4, 0x1 + psubsw m0, m3 + psubsw m1, m4 + + ABS1 m0, m5 + ABS1 m1, m5 + paddw m0, m1 + movq m1, m0 + punpcklwd m0, m_zero + punpckhwd m1, m_zero + paddd m0, m1 + paddd m_sum, m0 + + add index, 0x8 + CMP parallel_width, index + jg .loop + +.handle_last_few_pixels: + movd totald, m_sum + psrlq m_sum, 0x20 + movd tmp1d, m_sum + add totald, tmp1d + CMP DWORD widthd, parallel_widthd + jle .end + LEFT_OVER_LOOP + +.end: + mov eax, totald + RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmxext +IDET_FILTER_LINE +%endif + +INIT_MMX mmx +IDET_FILTER_LINE + +;; SSE2 8-bit implementation that does 16-bytes at a time: +;; +;; const int w2 = w >> 4 +;; for (int i = 0; i < w2, ++i) { +;; const __m128i A = _mm_loadu_si128(&p_a[i]) +;; const __m128i B = _mm_loadu_si128(&p_b[i]) +;; const __m128i C = _mm_loadu_si128(&p_c[i]) +;; const 
__m128i ab = _mm_subs_epu8(A, B) +;; const __m128i ba = _mm_subs_epu8(B, A) +;; const __m128i bc = _mm_subs_epu8(B, C) +;; const __m128i cb = _mm_subs_epu8(C, B) +;; const __m128i s1 = _mm_sad_epu8(ab, bc) +;; const __m128i s2 = _mm_sad_epu8(ba, cb) +;; result1 = _mm_add_epi64(result1, s1) +;; result2 = _mm_add_epi64(result2, s2) +;; } +INIT_XMM sse2 +cglobal idet_filter_line, 4, 8, 8, 0x10, a, b, c, width + xor index, index + pxor m0, m0 + movdqa m1, m0 + + mov parallel_widthd, widthd + sar parallel_widthd, 0x4 + test parallel_widthd, parallel_widthd + jle .sse2_cleanup + + shl parallel_width, 0x4 + +.sse2_loop: + movdqu m2, [bq+index*1] ; B + movdqu m3, [aq+index*1] ; A + movdqa m5, m2 + movdqa m6, m2 + movdqa m4, m3 + psubusb m5, m3 ; ba + + movdqu m3, [cq+index*1] ; C + add index, 0x10 + psubusb m4, m2 ; ab + CMP index, parallel_width + + psubusb m6, m3 ; bc + psubusb m3, m2 ; cb + + psadbw m4, m6 ; |ab - bc| + paddq m0, m4 + psadbw m5, m3 ; |ba - cb| + paddq m1, m5 + jne .sse2_loop + +.sse2_cleanup: + paddq m0, m1 + movhlps m1, m0 + paddq m0, m1 + movd total, m0 + + CMP DWORD widthd, parallel_widthd + jle .sse2_end + + LEFT_OVER_LOOP + +.sse2_end: + mov eax, totald + RET diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c new file mode 100644 index 0000000..9a03989 --- /dev/null +++ b/libavfilter/x86/vf_idet_init.c @@ -0,0 +1,53 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_idet.h" + +int ff_idet_filter_line_mmx(const uint8_t *a, const uint8_t *b, + const uint8_t *c, int w); +int ff_idet_filter_line_mmxext(const uint8_t *a, const uint8_t *b, + const uint8_t *c, int w); +int ff_idet_filter_line_sse2(const uint8_t *a, const uint8_t *b, + const uint8_t *c, int w); + +av_cold void ff_idet_init_x86(IDETContext *idet) +{ +#if HAVE_YASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + idet->filter_line = ff_idet_filter_line_mmx; + } + +#if ARCH_X86_32 + if (EXTERNAL_MMXEXT(cpu_flags)) { + idet->filter_line = ff_idet_filter_line_mmxext; + } +#endif // ARCH_x86_32 + + if (EXTERNAL_SSE2(cpu_flags)) { + idet->filter_line = ff_idet_filter_line_sse2; + } + +#endif // HAVE_YASM +}
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel