On Wed, Sep 03, 2014 at 05:50:32PM +0200, Pascal Massimino wrote: [...] > removed this step in both mmx and sse2 version. > > -> new patch attached. > > /skal
> From d2249b05b4a881ec3c9de8fc105b2a40c680a0ea Mon Sep 17 00:00:00 2001 > From: skal <pascal.massim...@gmail.com> > Date: Wed, 3 Sep 2014 11:02:32 +0200 > Subject: [PATCH] MMX/MMXEXT/SSE2 implementation of idet's filter_line() > > integration by Neil Birkbeck, with help from Vitor Sessak. > core SSE2 loop by Skal (pascal.massim...@gmail.com) > --- > MAINTAINERS | 1 + > libavfilter/vf_idet.c | 38 ++---------- > libavfilter/vf_idet.h | 58 ++++++++++++++++++ > libavfilter/x86/Makefile | 2 + > libavfilter/x86/vf_idet.asm | 134 > +++++++++++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_idet_init.c | 71 ++++++++++++++++++++++ > 6 files changed, 272 insertions(+), 32 deletions(-) > create mode 100644 libavfilter/vf_idet.h > create mode 100644 libavfilter/x86/vf_idet.asm > create mode 100644 libavfilter/x86/vf_idet_init.c > [...] > @@ -77,7 +48,7 @@ static const char *type2str(Type type) > return NULL; > } > > -static int filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t > *c, int w) > +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t > *c, int w) why do you need to export it? > { > int x; > int ret=0; > @@ -271,7 +242,10 @@ static av_cold int init(AVFilterContext *ctx) > idet->last_type = UNDETERMINED; > memset(idet->history, UNDETERMINED, HIST_SIZE); > > - idet->filter_line = filter_line_c; > + idet->filter_line = ff_idet_filter_line_c; > + > + if (ARCH_X86) > + ff_idet_init_x86(idet); wrong indent [...] > diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm > new file mode 100644 > index 0000000..3fb265c > --- /dev/null > +++ b/libavfilter/x86/vf_idet.asm > @@ -0,0 +1,134 @@ > +;; > ***************************************************************************** > +;; * x86-optimized functions for idet filter > +;; * > +;; * This file is part of FFmpeg. > +;; * > +;; * FFmpeg is free software; you can redistribute it and/or modify > +;; * it under the terms of the GNU General Public License as published by > +;; * the Free Software Foundation; either version 2 of the License, or > +;; * (at your option) any later version. > +;; * > +;; * FFmpeg is distributed in the hope that it will be useful, > +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of > +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +;; * GNU General Public License for more details. > +;; * > +;; * You should have received a copy of the GNU General Public License along > +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc., > +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > +;; > ****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION .text SECTION_TEXT > + > +; Implementation that does 8-bytes at a time using single-word operations. > +%macro IDET_FILTER_LINE 0 > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index > + xor indexq, indexq > +%define m_zero m7 > +%define m_sum m6 > + pxor m_sum, m_sum > + pxor m_zero, m_zero > + > +.loop: > + movq m0, [aq+indexq*1] movu > + movq m1, m0 > + punpcklbw m0, m_zero > + punpckhbw m1, m_zero Here and later, you can do: punpckhbw m1, m0, m_zero punpcklbw m0, m_zero > + > + movq m3, [cq+indexq*1] movu > + movq m4, m3 mova etc > + punpcklbw m3, m_zero > + punpckhbw m4, m_zero > + > + paddsw m0, m3 > + paddsw m1, m4 > + > + movq m3, [bq+indexq*1] > + movq m4, m3 > + punpcklbw m3, m_zero > + punpckhbw m4, m_zero > + > + paddw m3, m3 > + paddw m4, m4 > + psubsw m0, m3 > + psubsw m1, m4 > + > + ABS1 m0, m5 > + ABS1 m1, m5 ABS2? > + paddw m0, m1 > + movq m1, m0 > + punpcklwd m0, m_zero > + punpckhwd m1, m_zero > + paddd m0, m1 > + paddd m_sum, m0 > + > + add indexq, 0x8 > + CMP widthq, indexq Someone needs to confirm this, but I think you'll need to make width a ptrdiff_t and not an int Also... stupid question but what's CMP? > + jg .loop > + > + movq m0, m_sum > + psrlq m_sum, 0x20 > + paddq m0, m_sum > + movd eax, m0 > + RET > +%endmacro > + > +%if ARCH_X86_32 > +INIT_MMX mmxext > +IDET_FILTER_LINE > + > +INIT_MMX mmx > +IDET_FILTER_LINE > +%endif > + > +;; SSE2 8-bit implementation that does 16-bytes at a time: > +;; > +;; const int w2 = w >> 4 > +;; for (int i = 0; i < w2, ++i) { > +;; const __m128i A = _mm_loadu_si128(&p_a[i]) > +;; const __m128i B = _mm_loadu_si128(&p_b[i]) > +;; const __m128i C = _mm_loadu_si128(&p_c[i]) > +;; const __m128i ab = _mm_subs_epu8(A, B) > +;; const __m128i ba = _mm_subs_epu8(B, A) > +;; const __m128i bc = _mm_subs_epu8(B, C) > +;; const __m128i cb = _mm_subs_epu8(C, B) > +;; const __m128i s1 = _mm_sad_epu8(ab, bc) > +;; const __m128i s2 = _mm_sad_epu8(ba, cb) > +;; result1 = _mm_add_epi64(result1, s1) > +;; result2 = _mm_add_epi64(result2, s2) > +;; } please drop this > +INIT_XMM sse2 > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total > + xor indexq, indexq > + pxor m0, m0 > + pxor m1, m1 > + > +.sse2_loop: > + movdqu m2, [bq+indexq*1] ; B > + movdqu m3, [aq+indexq*1] ; A movu > + movdqa m5, m2 > + movdqa m6, m2 > + movdqa m4, m3 mova etc > + psubusb m5, m3 ; ba psubusb m5, m2, m3 and drop the previous mov > + > + movdqu m3, [cq+indexq*1] ; C > + add indexq, 0x10 > + psubusb m4, m2 ; ab > + CMP indexq, widthq > + > + psubusb m6, m3 ; bc > + psubusb m3, m2 ; cb > + > + psadbw m4, m6 ; |ab - bc| > + paddq m0, m4 > + psadbw m5, m3 ; |ba - cb| > + paddq m1, m5 > + jl .sse2_loop > + > + paddq m0, m1 > + movhlps m1, m0 > + paddq m0, m1 > + movd eax, m0 > + RET > diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c > new file mode 100644 > index 0000000..98086ef > --- /dev/null > +++ b/libavfilter/x86/vf_idet_init.c > @@ -0,0 +1,71 @@ > +/* > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/mem.h" > +#include "libavutil/x86/asm.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/vf_idet.h" > + > +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ > +#define FUNC_MAIN_DECL(KIND, SPAN) \ > +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ > + const uint8_t *c, int w); \ > +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ > + const uint8_t *c, int w) { \ > + int sum = 0; \ > + const int left_over = w % SPAN; \ > + w -= left_over; \ > + if (w > 0) \ > + sum += ff_idet_filter_line_##KIND(a, b, c, w); \ > + if (left_over > 0) \ > + sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \ > + return sum; \ > +} wrong indent, 4 spaces [...] -- Clément B.
pgpcvgcInmiQB.pgp
Description: PGP signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel