On Sat, Sep 13, 2014 at 10:12:12PM -0300, James Almer wrote: > Also add a missing c->pix_abs[0][0] initialization, and sse2 versions of > sad16_x2, sad16_y2 and sad16_xy2. > Since the _xy2 versions are not bitexact, they are accordingly marked as > approximate. > > Signed-off-by: James Almer <jamr...@gmail.com> > ---
> Not benched. If the author of some code doesn't benchmark his code, how can he know which way is faster, or what effect each difference has? ... > > libavcodec/x86/me_cmp.asm | 229 > +++++++++++++++++++++++++++++++++++++++++++ > libavcodec/x86/me_cmp_init.c | 203 +++++++++----------------------------- > 2 files changed, 278 insertions(+), 154 deletions(-) > > diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm > index b0741f3..68dc701 100644 > --- a/libavcodec/x86/me_cmp.asm > +++ b/libavcodec/x86/me_cmp.asm > @@ -23,6 +23,10 @@ > > %include "libavutil/x86/x86util.asm" > > +SECTION_RODATA > + > +cextern pb_1 > + > SECTION .text > > %macro DIFF_PIXELS_1 4 > @@ -465,3 +469,228 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h > INIT_MMX mmx > HF_NOISE 8 > HF_NOISE 16 > + > +;--------------------------------------------------------------------------------------- > +;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int > stride, int h); > +;--------------------------------------------------------------------------------------- > +%macro SAD 1 > +cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h > +%if %1 == mmsize > + shr hd, 1 > +%define STRIDE strideq > +%else > +%define STRIDE 8 > +%endif > + pxor m2, m2 > + > +align 16 > +.loop > + movu m0, [pix2q] > + movu m1, [pix2q+STRIDE] > + psadbw m0, [pix1q] > + psadbw m1, [pix1q+STRIDE] > + paddw m2, m0 > + paddw m2, m1 > +%if %1 == mmsize > + lea pix1q, [pix1q+strideq*2] > + lea pix2q, [pix2q+strideq*2] > +%else > + add pix1q, strideq > + add pix2q, strideq > +%endif > + dec hd > + jg .loop The other loops use jnz; why the difference?
> +%if mmsize == 16 > + movhlps m0, m2 > + paddw m2, m0 > +%endif > + movd eax, m2 > + RET > +%endmacro > + > +INIT_MMX mmxext > +SAD 8 > +SAD 16 > +INIT_XMM sse2 > +SAD 16 > + > +;------------------------------------------------------------------------------------------ > +;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int > stride, int h); > +;------------------------------------------------------------------------------------------ > +%macro SAD_X2 1 > +cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h > +%if %1 == mmsize > + shr hd, 1 > +%define STRIDE strideq > +%else > +%define STRIDE 8 > +%endif > + pxor m0, m0 > + > +align 16 Do these improve or reduce the speed? > +.loop: > + movu m1, [pix2q] > + movu m2, [pix2q+STRIDE] > +%if cpuflag(sse2) > + movu m3, [pix2q+1] > + movu m4, [pix2q+STRIDE+1] > + pavgb m1, m3 > + pavgb m2, m4 > +%else > + pavgb m1, [pix2q+1] > + pavgb m2, [pix2q+STRIDE+1] > +%endif > + psadbw m1, [pix1q] > + psadbw m2, [pix1q+STRIDE] > + paddw m0, m1 > + paddw m0, m2 > +%if %1 == mmsize > + lea pix1q, [pix1q+2*strideq] > + lea pix2q, [pix2q+2*strideq] > +%else > + add pix1q, strideq > + add pix2q, strideq > +%endif > + dec hd dec/inc have some speed penalties on some CPUs; see section 16.2 in http://www.agner.org/optimize/optimizing_assembly.pdf [...] -- Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB If you think the Mossad wants you dead since a long time then you are either wrong or dead since a long time.
signature.asc
Description: Digital signature
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel