On 14/09/14 7:12 PM, Michael Niedermayer wrote: > On Sat, Sep 13, 2014 at 10:12:12PM -0300, James Almer wrote: >> Also add a missing c->pix_abs[0][0] initialization, and sse2 versions of >> sad16_x2, sad16_y2 and sad16_xy2. >> Since the _xy2 versions are not bitexact, they are accordingly marked as >> approximate. >> >> Signed-off-by: James Almer <jamr...@gmail.com> >> --- > >> Not benched. > > if the author of some code doesnt benchmark his code, how can he know > which way it is faster ? > what effect each difference has ? ...
I didn't bench because I didn't have the time and assumed it wasn't necessary considering this is a port from inline to yasm with little to no changes to the asm. I'll try to do some quick benchmarks later. > > >> >> libavcodec/x86/me_cmp.asm | 229 >> +++++++++++++++++++++++++++++++++++++++++++ >> libavcodec/x86/me_cmp_init.c | 203 +++++++++----------------------------- >> 2 files changed, 278 insertions(+), 154 deletions(-) >> >> diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm >> index b0741f3..68dc701 100644 >> --- a/libavcodec/x86/me_cmp.asm >> +++ b/libavcodec/x86/me_cmp.asm >> @@ -23,6 +23,10 @@ >> >> %include "libavutil/x86/x86util.asm" >> >> +SECTION_RODATA >> + >> +cextern pb_1 >> + >> SECTION .text >> >> %macro DIFF_PIXELS_1 4 >> @@ -465,3 +469,228 @@ cglobal hf_noise%1, 3,3,0, pix1, lsize, h >> INIT_MMX mmx >> HF_NOISE 8 >> HF_NOISE 16 >> + >> +;--------------------------------------------------------------------------------------- >> +;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int >> stride, int h); >> +;--------------------------------------------------------------------------------------- >> +%macro SAD 1 >> +cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h >> +%if %1 == mmsize >> + shr hd, 1 >> +%define STRIDE strideq >> +%else >> +%define STRIDE 8 >> +%endif >> + pxor m2, m2 >> + >> +align 16 >> +.loop >> + movu m0, [pix2q] >> + movu m1, [pix2q+STRIDE] >> + psadbw m0, [pix1q] >> + psadbw m1, [pix1q+STRIDE] >> + paddw m2, m0 >> + paddw m2, m1 >> +%if %1 == mmsize >> + lea pix1q, [pix1q+strideq*2] >> + lea pix2q, [pix2q+strideq*2] >> +%else >> + add pix1q, strideq >> + add pix2q, strideq >> +%endif > >> + dec hd >> + jg .loop > > the other loops use jnz, why the difference ? > Probably a copy-paste remnant. I'll make them consistent. 
> > >> +%if mmsize == 16 >> + movhlps m0, m2 >> + paddw m2, m0 >> +%endif >> + movd eax, m2 >> + RET >> +%endmacro >> + >> +INIT_MMX mmxext >> +SAD 8 >> +SAD 16 >> +INIT_XMM sse2 >> +SAD 16 >> + >> +;------------------------------------------------------------------------------------------ >> +;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, int >> stride, int h); >> +;------------------------------------------------------------------------------------------ >> +%macro SAD_X2 1 >> +cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h >> +%if %1 == mmsize >> + shr hd, 1 >> +%define STRIDE strideq >> +%else >> +%define STRIDE 8 >> +%endif >> + pxor m0, m0 >> + > >> +align 16 > > do these improve or reduce the speed ? No idea. I copied them from the inline version (where they were ".p2align 4") to keep the resulting asm as similar as possible. I'll check nonetheless. > > > >> +.loop: >> + movu m1, [pix2q] >> + movu m2, [pix2q+STRIDE] >> +%if cpuflag(sse2) >> + movu m3, [pix2q+1] >> + movu m4, [pix2q+STRIDE+1] >> + pavgb m1, m3 >> + pavgb m2, m4 >> +%else >> + pavgb m1, [pix2q+1] >> + pavgb m2, [pix2q+STRIDE+1] >> +%endif >> + psadbw m1, [pix1q] >> + psadbw m2, [pix1q+STRIDE] >> + paddw m0, m1 >> + paddw m0, m2 >> +%if %1 == mmsize >> + lea pix1q, [pix1q+2*strideq] >> + lea pix2q, [pix2q+2*strideq] >> +%else >> + add pix1q, strideq >> + add pix2q, strideq >> +%endif > >> + dec hd > > dec/inc has some speed penalties on some cpus > see 16.2 in http://www.agner.org/optimize/optimizing_assembly.pdf OK, I'll use sub then. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel