On Wed, Sep 03, 2014 at 07:05:48PM +0200, Pascal Massimino wrote: [...] > > > + punpcklbw m3, m_zero > > > + punpckhbw m4, m_zero > > > + > > > + paddsw m0, m3 > > > + paddsw m1, m4 > > > + > > > + movq m3, [bq+indexq*1] > > > + movq m4, m3 > > > + punpcklbw m3, m_zero > > > + punpckhbw m4, m_zero > > > + > > > + paddw m3, m3 > > > + paddw m4, m4 > > > + psubsw m0, m3 > > > + psubsw m1, m4 > > > + > > > > > + ABS1 m0, m5 > > > + ABS1 m1, m5 > > > > ABS2? > > > > ABS2 requires the two tmp registers to be different (can't use m5 for both). >
Aren't m3 and m4 available at that point? (ABS2 has the benefit of doing some pairing so is faster than doing two ABS1) > > > > > > + paddw m0, m1 > > > + movq m1, m0 > > > + punpcklwd m0, m_zero > > > + punpckhwd m1, m_zero > > > + paddd m0, m1 > > > + paddd m_sum, m0 > > > + > > > + add indexq, 0x8 > > > > > + CMP widthq, indexq > > > > Someone needs to confirm this, but I think you'll need to make width a > > ptrdiff_t and not an int > > > > changed to widthd/indexd, that's enough. > Hopefully... > > > > > Also... stupid question but what's CMP? > > > > it's equivalent to 'cmp DWORD' here iirc. > I believe you can keep it lowercase. I thought it was a macro but didn't see anything like this. > > [...] > diff --git a/libavfilter/x86/vf_idet.asm b/libavfilter/x86/vf_idet.asm > new file mode 100644 > index 0000000..19b7f3b > --- /dev/null > +++ b/libavfilter/x86/vf_idet.asm > @@ -0,0 +1,116 @@ > +;; > ***************************************************************************** > +;; * x86-optimized functions for idet filter > +;; * > +;; * This file is part of FFmpeg. > +;; * > +;; * FFmpeg is free software; you can redistribute it and/or modify > +;; * it under the terms of the GNU General Public License as published by > +;; * the Free Software Foundation; either version 2 of the License, or > +;; * (at your option) any later version. > +;; * > +;; * FFmpeg is distributed in the hope that it will be useful, > +;; * but WITHOUT ANY WARRANTY; without even the implied warranty of > +;; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > +;; * GNU General Public License for more details. > +;; * > +;; * You should have received a copy of the GNU General Public License along > +;; * with FFmpeg; if not, write to the Free Software Foundation, Inc., > +;; * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
> +;; > ****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_TEXT > + > +; Implementation that does 8-bytes at a time using single-word operations. > +%macro IDET_FILTER_LINE 0 > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index Shouldn't this be 4,5,8? 4 args, 5 regs (4 args + 1 local var), 8 xmm (m0..m7) And BTW, you don't seem to use m2, so you can have 4,5,7 ; you probably want to define m_zero on m2 to avoid any confusion. Now, this macro is only used for MMX, so you can probably do 4,5,0 (no XMM reg). If you do 4,5,0, please move the INIT_MMX into the macro: %macro IDET_FILTER_LINE_MMX 1 INIT_MMX %1 ... %endmacro ... IDET_FILTER_LINE_MMX mmx IDET_FILTER_LINE_MMX mmxext > + xor indexq, indexq > +%define m_zero m7 > +%define m_sum m6 > + pxor m_sum, m_sum > + pxor m_zero, m_zero > + > +.loop: > + movu m0, [aq+indexq*1] > + punpckhbw m1, m0, m_zero > + punpcklbw m0, m_zero > + > + movu m3, [cq+indexq*1] > + punpckhbw m4, m3, m_zero > + punpcklbw m3, m_zero > + > + paddsw m1, m4 > + paddsw m0, m3 > + > + movu m3, [bq+indexq*1] > + punpckhbw m4, m3, m_zero > + punpcklbw m3, m_zero > + > + paddw m4, m4 > + paddw m3, m3 > + psubsw m1, m4 > + psubsw m0, m3 > + > + ABS1 m1, m5 > + ABS1 m0, m5 > + > + paddw m0, m1 > + punpckhwd m1, m0, m_zero > + punpcklwd m0, m_zero > + > + paddd m0, m1 > + paddd m_sum, m0 > + > + add indexq, 0x8 > + CMP widthd, indexd > + jg .loop > + > + mova m0, m_sum > + psrlq m_sum, 0x20 > + paddq m0, m_sum > + movd eax, m0 > + RET > +%endmacro > + > +%if ARCH_X86_32 > +INIT_MMX mmxext > +IDET_FILTER_LINE > + > +INIT_MMX mmx > +IDET_FILTER_LINE > +%endif > + > +;; SSE2 8-bit implementation that does 16-bytes at a time: > +INIT_XMM sse2 > +cglobal idet_filter_line, 4, 8, 6, a, b, c, width, index, total 4,6,7, AFAICT > + xor indexq, indexq > + pxor m0, m0 > + pxor m1, m1 > + > +.sse2_loop: > + movu m2, [bq+indexq*1] ; B > + movu m3, [aq+indexq*1] ; A > + mova m6, m2 > + mova m4, 
m3 > + psubusb m5, m2, m3 ; ba > + > + movu m3, [cq+indexq*1] ; C > + add indexq, 0x10 > + psubusb m4, m2 ; ab > + CMP indexd, widthd > + > + psubusb m6, m3 ; bc > + psubusb m3, m2 ; cb > + > + psadbw m4, m6 ; |ab - bc| > + paddq m0, m4 > + psadbw m5, m3 ; |ba - cb| > + paddq m1, m5 > + jl .sse2_loop > + > + paddq m0, m1 > + movhlps m1, m0 > + paddq m0, m1 > + movd eax, m0 > + RET [...] -- Clément B.
pgpsYjHLrF3Sz.pgp
Description: PGP signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel