Clément, On Wed, Sep 3, 2014 at 6:19 PM, Clément Bœsch <u...@pkh.me> wrote:
> On Wed, Sep 03, 2014 at 05:50:32PM +0200, Pascal Massimino wrote: > [...] > > removed this step in both mmx and sse2 version. > > > > -> new patch attached. > > > > /skal > > > From d2249b05b4a881ec3c9de8fc105b2a40c680a0ea Mon Sep 17 00:00:00 2001 > > From: skal <pascal.massim...@gmail.com> > > Date: Wed, 3 Sep 2014 11:02:32 +0200 > > Subject: [PATCH] MMX/MMXEXT/SSE2 implementation of idet's filter_line() > > > > integration by Neil Birkbeck, with help from Vitor Sessak. > > core SSE2 loop by Skal (pascal.massim...@gmail.com) > > --- > > MAINTAINERS | 1 + > > libavfilter/vf_idet.c | 38 ++---------- > > libavfilter/vf_idet.h | 58 ++++++++++++++++++ > > libavfilter/x86/Makefile | 2 + > > libavfilter/x86/vf_idet.asm | 134 > +++++++++++++++++++++++++++++++++++++++++ > > libavfilter/x86/vf_idet_init.c | 71 ++++++++++++++++++++++ > > 6 files changed, 272 insertions(+), 32 deletions(-) > > create mode 100644 libavfilter/vf_idet.h > > create mode 100644 libavfilter/x86/vf_idet.asm > > create mode 100644 libavfilter/x86/vf_idet_init.c > > > [...] > > @@ -77,7 +48,7 @@ static const char *type2str(Type type) > > return NULL; > > } > > > > -static int filter_line_c(const uint8_t *a, const uint8_t *b, const > uint8_t *c, int w) > > +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const > uint8_t *c, int w) > > why do you need to export it? > it's used to finish the left-overs in mmx/mmxext/sse2. See FUNC_MAIN_DECL macro. > > > { > > int x; > > int ret=0; > > @@ -271,7 +242,10 @@ static av_cold int init(AVFilterContext *ctx) > > idet->last_type = UNDETERMINED; > > memset(idet->history, UNDETERMINED, HIST_SIZE); > > > > - idet->filter_line = filter_line_c; > > + idet->filter_line = ff_idet_filter_line_c; > > + > > + if (ARCH_X86) > > + ff_idet_init_x86(idet); > > wrong indent > > fixed > [...] > > diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm > > new file mode 100644 > > index 0000000..3fb265c > > --- /dev/null > > +++ b/libavfilter/x86/vf_idet.asm > > @@ -0,0 +1,134 @@ > > +;; > ***************************************************************************** > > +;; * x86-optimized functions for idet filter > > +;; * > > +;; * This file is part of FFmpeg. > > +;; * > > +;; * FFmpeg is free software; you can redistribute it and/or modify > > +;; * it under the terms of the GNU General Public License as published > by > > +;; * the Free Software Foundation; either version 2 of the License, or > > +;; * (at your option) any later version. > > +;; * > > +;; * FFmpeg is distributed in the hope that it will be useful, > > +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of > > +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > +;; * GNU General Public License for more details. > > +;; * > > +;; * You should have received a copy of the GNU General Public License > along > > +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc., > > +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. > > +;; > ****************************************************************************** > > + > > +%include "libavutil/x86/x86util.asm" > > + > > > +SECTION .text > > SECTION_TEXT > fixed > > > + > > +; Implementation that does 8-bytes at a time using single-word > operations. > > +%macro IDET_FILTER_LINE 0 > > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index > > + xor indexq, indexq > > +%define m_zero m7 > > +%define m_sum m6 > > + pxor m_sum, m_sum > > + pxor m_zero, m_zero > > + > > +.loop: > > > + movq m0, [aq+indexq*1] > > movu > > > + movq m1, m0 > > + punpcklbw m0, m_zero > > + punpckhbw m1, m_zero > > Here and later, you can do: > > punpckhbw m1, m0, m_zero > punpcklbw m0, m_zero > done > > > + > > + movq m3, [cq+indexq*1] > > movu > > > + movq m4, m3 > > mova > > etc > > > + punpcklbw m3, m_zero > > + punpckhbw m4, m_zero > > + > > + paddsw m0, m3 > > + paddsw m1, m4 > > + > > + movq m3, [bq+indexq*1] > > + movq m4, m3 > > + punpcklbw m3, m_zero > > + punpckhbw m4, m_zero > > + > > + paddw m3, m3 > > + paddw m4, m4 > > + psubsw m0, m3 > > + psubsw m1, m4 > > + > > > + ABS1 m0, m5 > > + ABS1 m1, m5 > > ABS2? > ABS2 requires the two tmp registers to be different (can't use m5 for both). > > > + paddw m0, m1 > > + movq m1, m0 > > + punpcklwd m0, m_zero > > + punpckhwd m1, m_zero > > + paddd m0, m1 > > + paddd m_sum, m0 > > + > > + add indexq, 0x8 > > > + CMP widthq, indexq > > Someone needs to confirm this, but I think you'll need to make width a > ptrdiff_t and not an int > changed to widthd/indexd, that's enough. > > Also... stupid question but what's CMP? > it's equivalent to 'cmp DWORD' here iirc. > > + jg .loop > > + > > + movq m0, m_sum > > + psrlq m_sum, 0x20 > > + paddq m0, m_sum > > + movd eax, m0 > > + RET > > +%endmacro > > + > > +%if ARCH_X86_32 > > +INIT_MMX mmxext > > +IDET_FILTER_LINE > > + > > +INIT_MMX mmx > > +IDET_FILTER_LINE > > +%endif > > + > > +;; SSE2 8-bit implementation that does 16-bytes at a time: > > +;; > > > +;; const int w2 = w >> 4 > > +;; for (int i = 0; i < w2, ++i) { > > +;; const __m128i A = _mm_loadu_si128(&p_a[i]) > > +;; const __m128i B = _mm_loadu_si128(&p_b[i]) > > +;; const __m128i C = _mm_loadu_si128(&p_c[i]) > > +;; const __m128i ab = _mm_subs_epu8(A, B) > > +;; const __m128i ba = _mm_subs_epu8(B, A) > > +;; const __m128i bc = _mm_subs_epu8(B, C) > > +;; const __m128i cb = _mm_subs_epu8(C, B) > > +;; const __m128i s1 = _mm_sad_epu8(ab, bc) > > +;; const __m128i s2 = _mm_sad_epu8(ba, cb) > > +;; result1 = _mm_add_epi64(result1, s1) > > +;; result2 = _mm_add_epi64(result2, s2) > > +;; } > > please drop this > > done > > +INIT_XMM sse2 > > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total > > + xor indexq, indexq > > + pxor m0, m0 > > + pxor m1, m1 > > + > > +.sse2_loop: > > + movdqu m2, [bq+indexq*1] ; B > > + movdqu m3, [aq+indexq*1] ; A > > movu > done > > > + movdqa m5, m2 > > + movdqa m6, m2 > > + movdqa m4, m3 > > mova > > done > etc > > > + psubusb m5, m3 ; ba > > psubusb m5, m2, m3 and drop the previous mov > done > > > + > > + movdqu m3, [cq+indexq*1] ; C > > + add indexq, 0x10 > > + psubusb m4, m2 ; ab > > + CMP indexq, widthq > > + > > + psubusb m6, m3 ; bc > > + psubusb m3, m2 ; cb > > + > > + psadbw m4, m6 ; |ab - bc| > > + paddq m0, m4 > > + psadbw m5, m3 ; |ba - cb| > > + paddq m1, m5 > > + jl .sse2_loop > > + > > + paddq m0, m1 > > + movhlps m1, m0 > > + paddq m0, m1 > > + movd eax, m0 > > + RET > > diff --git a/libavfilter/x86/vf_idet_init.c > b/libavfilter/x86/vf_idet_init.c > > new file mode 100644 > > index 0000000..98086ef > > --- /dev/null > > +++ b/libavfilter/x86/vf_idet_init.c > > @@ -0,0 +1,71 @@ > > +/* > > + * This file is part of FFmpeg. > > + * > > + * FFmpeg is free software; you can redistribute it and/or > > + * modify it under the terms of the GNU General Public > > + * License as published by the Free Software Foundation; either > > + * version 2 of the License, or (at your option) any later version. > > + * > > + * FFmpeg is distributed in the hope that it will be useful, > > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > + * GNU General Public License for more details. > > + * > > + * You should have received a copy of the GNU General Public License > along > > + * with FFmpeg; if not, write to the Free Software > > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > + */ > > + > > +#include "libavutil/attributes.h" > > +#include "libavutil/cpu.h" > > +#include "libavutil/mem.h" > > +#include "libavutil/x86/asm.h" > > +#include "libavutil/x86/cpu.h" > > +#include "libavfilter/vf_idet.h" > > + > > +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ > > +#define FUNC_MAIN_DECL(KIND, SPAN) > \ > > +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, > \ > > + const uint8_t *c, int w); > \ > > +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, > \ > > + const uint8_t *c, int w) { > \ > > + int sum = 0; > \ > > + const int left_over = w % SPAN; > \ > > + w -= left_over; > \ > > + if (w > 0) > \ > > + sum += ff_idet_filter_line_##KIND(a, b, c, w); > \ > > + if (left_over > 0) > \ > > + sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); > \ > > + return sum; > \ > > +} > > wrong indent, 4 spaces > done eveywhere. thanks! new patch attached. /skal > > [...] > > -- > Clément B. >
From 9b6df4a6ccf98ed5f3f064d0aff3171f94a7f907 Mon Sep 17 00:00:00 2001 From: skal <pascal.massim...@gmail.com> Date: Wed, 3 Sep 2014 11:02:32 +0200 Subject: [PATCH] MMX/MMXEXT/SSE2 implementation of idet's filter_line() integration by Neil Birkbeck, with help from Vitor Sessak. core SSE2 loop by Skal (pascal.massim...@gmail.com) --- MAINTAINERS | 1 + libavfilter/vf_idet.c | 38 +++----------- libavfilter/vf_idet.h | 58 +++++++++++++++++++++ libavfilter/x86/Makefile | 2 + libavfilter/x86/vf_idet.asm | 116 +++++++++++++++++++++++++++++++++++++++++ libavfilter/x86/vf_idet_init.c | 70 +++++++++++++++++++++++++ 6 files changed, 253 insertions(+), 32 deletions(-) create mode 100644 libavfilter/vf_idet.h create mode 100644 libavfilter/x86/vf_idet.asm create mode 100644 libavfilter/x86/vf_idet_init.c diff --git a/MAINTAINERS b/MAINTAINERS index b80fe7a..0992ab5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -353,6 +353,7 @@ Filters: vf_extractplanes.c Paul B Mahol vf_histogram.c Paul B Mahol vf_hqx.c Clément Bœsch + vf_idec.c Pascal Massimino vf_il.c Paul B Mahol vf_lenscorrection.c Daniel Oberhoff vf_mergeplanes.c Paul B Mahol diff --git a/libavfilter/vf_idet.c b/libavfilter/vf_idet.c index d441a5f..4416228 100644 --- a/libavfilter/vf_idet.c +++ b/libavfilter/vf_idet.c @@ -23,37 +23,8 @@ #include "libavutil/cpu.h" #include "libavutil/common.h" #include "libavutil/opt.h" -#include "libavutil/pixdesc.h" -#include "avfilter.h" #include "internal.h" - -#define HIST_SIZE 4 - -typedef enum { - TFF, - BFF, - PROGRSSIVE, - UNDETERMINED, -} Type; - -typedef struct { - const AVClass *class; - float interlace_threshold; - float progressive_threshold; - - Type last_type; - int prestat[4]; - int poststat[4]; - - uint8_t history[HIST_SIZE]; - - AVFrame *cur; - AVFrame *next; - AVFrame *prev; - int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); - - const AVPixFmtDescriptor *csp; -} IDETContext; +#include "vf_idet.h" #define OFFSET(x) offsetof(IDETContext, x) #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM @@ -77,7 +48,7 @@ static const char *type2str(Type type) return NULL; } -static int filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w) +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w) { int x; int ret=0; @@ -271,7 +242,10 @@ static av_cold int init(AVFilterContext *ctx) idet->last_type = UNDETERMINED; memset(idet->history, UNDETERMINED, HIST_SIZE); - idet->filter_line = filter_line_c; + idet->filter_line = ff_idet_filter_line_c; + + if (ARCH_X86) + ff_idet_init_x86(idet); return 0; } diff --git a/libavfilter/vf_idet.h b/libavfilter/vf_idet.h new file mode 100644 index 0000000..3820e94 --- /dev/null +++ b/libavfilter/vf_idet.h @@ -0,0 +1,58 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVFILTER_IDET_H +#define AVFILTER_IDET_H + +#include "libavutil/pixdesc.h" +#include "avfilter.h" + +#define HIST_SIZE 4 + +typedef enum { + TFF, + BFF, + PROGRSSIVE, + UNDETERMINED, +} Type; + +typedef struct { + const AVClass *class; + float interlace_threshold; + float progressive_threshold; + + Type last_type; + int prestat[4]; + int poststat[4]; + + uint8_t history[HIST_SIZE]; + + AVFrame *cur; + AVFrame *next; + AVFrame *prev; + int (*filter_line)(const uint8_t *prev, const uint8_t *cur, const uint8_t *next, int w); + + const AVPixFmtDescriptor *csp; +} IDETContext; + +void ff_idet_init_x86(IDETContext *idet); + +/* main fall-back for left-over */ +int ff_idet_filter_line_c(const uint8_t *a, const uint8_t *b, const uint8_t *c, int w); + +#endif diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile index 6a252b4..ddb3774 100644 --- a/libavfilter/x86/Makefile +++ b/libavfilter/x86/Makefile @@ -1,5 +1,6 @@ OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun_init.o OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d_init.o +OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet_init.o OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup_init.o OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o @@ -7,6 +8,7 @@ OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif_init.o YASM-OBJS-$(CONFIG_GRADFUN_FILTER) += x86/vf_gradfun.o YASM-OBJS-$(CONFIG_HQDN3D_FILTER) += x86/vf_hqdn3d.o +YASM-OBJS-$(CONFIG_IDET_FILTER) += x86/vf_idet.o YASM-OBJS-$(CONFIG_PULLUP_FILTER) += x86/vf_pullup.o YASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o YASM-OBJS-$(CONFIG_YADIF_FILTER) += x86/vf_yadif.o x86/yadif-16.o x86/yadif-10.o diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm new file mode 100644 index 0000000..19b7f3b --- /dev/null +++ b/libavfilter/x86/vf_idet.asm @@ -0,0 +1,116 @@ +;; ***************************************************************************** +;; * x86-optimized functions for idet filter +;; * +;; * This file is part of FFmpeg. +;; * +;; * FFmpeg is free software; you can redistribute it and/or modify +;; * it under the terms of the GNU General Public License as published by +;; * the Free Software Foundation; either version 2 of the License, or +;; * (at your option) any later version. +;; * +;; * FFmpeg is distributed in the hope that it will be useful, +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; * GNU General Public License for more details. +;; * +;; * You should have received a copy of the GNU General Public License along +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc., +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +;; ****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_TEXT + +; Implementation that does 8-bytes at a time using single-word operations. +%macro IDET_FILTER_LINE 0 +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index + xor indexq, indexq +%define m_zero m7 +%define m_sum m6 + pxor m_sum, m_sum + pxor m_zero, m_zero + +.loop: + movu m0, [aq+indexq*1] + punpckhbw m1, m0, m_zero + punpcklbw m0, m_zero + + movu m3, [cq+indexq*1] + punpckhbw m4, m3, m_zero + punpcklbw m3, m_zero + + paddsw m1, m4 + paddsw m0, m3 + + movu m3, [bq+indexq*1] + punpckhbw m4, m3, m_zero + punpcklbw m3, m_zero + + paddw m4, m4 + paddw m3, m3 + psubsw m1, m4 + psubsw m0, m3 + + ABS1 m1, m5 + ABS1 m0, m5 + + paddw m0, m1 + punpckhwd m1, m0, m_zero + punpcklwd m0, m_zero + + paddd m0, m1 + paddd m_sum, m0 + + add indexq, 0x8 + CMP widthd, indexd + jg .loop + + mova m0, m_sum + psrlq m_sum, 0x20 + paddq m0, m_sum + movd eax, m0 + RET +%endmacro + +%if ARCH_X86_32 +INIT_MMX mmxext +IDET_FILTER_LINE + +INIT_MMX mmx +IDET_FILTER_LINE +%endif + +;; SSE2 8-bit implementation that does 16-bytes at a time: +INIT_XMM sse2 +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total + xor indexq, indexq + pxor m0, m0 + pxor m1, m1 + +.sse2_loop: + movu m2, [bq+indexq*1] ; B + movu m3, [aq+indexq*1] ; A + mova m6, m2 + mova m4, m3 + psubusb m5, m2, m3 ; ba + + movu m3, [cq+indexq*1] ; C + add indexq, 0x10 + psubusb m4, m2 ; ab + CMP indexd, widthd + + psubusb m6, m3 ; bc + psubusb m3, m2 ; cb + + psadbw m4, m6 ; |ab - bc| + paddq m0, m4 + psadbw m5, m3 ; |ba - cb| + paddq m1, m5 + jl .sse2_loop + + paddq m0, m1 + movhlps m1, m0 + paddq m0, m1 + movd eax, m0 + RET diff --git a/libavfilter/x86/vf_idet_init.c b/libavfilter/x86/vf_idet_init.c new file mode 100644 index 0000000..402d504 --- /dev/null +++ b/libavfilter/x86/vf_idet_init.c @@ -0,0 +1,70 @@ +/* + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavutil/mem.h" +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavfilter/vf_idet.h" + +/* declares main callable idet_filter_line_{mmx,mmxext,sse2}() */ +#define FUNC_MAIN_DECL(KIND, SPAN) \ +int ff_idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ + const uint8_t *c, int w); \ +static int idet_filter_line_##KIND(const uint8_t *a, const uint8_t *b, \ + const uint8_t *c, int w) { \ + int sum = 0; \ + const int left_over = w % SPAN; \ + w -= left_over; \ + if (w > 0) \ + sum += ff_idet_filter_line_##KIND(a, b, c, w); \ + if (left_over > 0) \ + sum += ff_idet_filter_line_c(a + w, b + w, c + w, left_over); \ + return sum; \ +} + +#if HAVE_YASM + +FUNC_MAIN_DECL(sse2, 16) +#if ARCH_X86_32 +FUNC_MAIN_DECL(mmx, 8) +FUNC_MAIN_DECL(mmxext, 8) +#endif + +#endif + +av_cold void ff_idet_init_x86(IDETContext *idet) +{ +#if HAVE_YASM + const int cpu_flags = av_get_cpu_flags(); + +#if ARCH_X86_32 + if (EXTERNAL_MMX(cpu_flags)) { + idet->filter_line = idet_filter_line_mmx; + } + if (EXTERNAL_MMXEXT(cpu_flags)) { + idet->filter_line = idet_filter_line_mmxext; + } +#endif // ARCH_x86_32 + + if (EXTERNAL_SSE2(cpu_flags)) { + idet->filter_line = idet_filter_line_sse2; + } +#endif // HAVE_YASM +} -- 2.1.0.rc2.206.gedb03e5
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel