ffmpeg | branch: master | Clément Bœsch <u...@pkh.me> | Thu Aug 14 22:30:55 2014 +0200| [45c7f3997ea11c3d1007b2126b1c0049a8c27105] | committer: Clément Bœsch
avutil/pixelutils: faster pixelutils_sad_[au]_16x16 ~560 → ~500 decicycles. This follows the comments from Michael in https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html Using 2 registers for the accumulator didn't help. On the other hand, some re-ordering between the movs and psadbw allowed going from ~538 to ~500 decicycles. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=45c7f3997ea11c3d1007b2126b1c0049a8c27105 --- libavutil/x86/pixelutils.asm | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm index 8ab0a18..15213d9 100644 --- a/libavutil/x86/pixelutils.asm +++ b/libavutil/x86/pixelutils.asm @@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2 %macro SAD_XMM_16x16 1 INIT_XMM sse2 cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2 - pxor m2, m2 -%rep 8 - mov%1 m0, [src2q] + mov%1 m2, [src2q] + psadbw m2, [src1q] mov%1 m1, [src2q + stride2q] - psadbw m0, [src1q] psadbw m1, [src1q + stride1q] - paddw m2, m0 paddw m2, m1 +%rep 7 lea src1q, [src1q + 2*stride1q] lea src2q, [src2q + 2*stride2q] + mov%1 m0, [src2q] + psadbw m0, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m0 + paddw m2, m1 %endrep movhlps m0, m2 paddw m2, m0 _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog