~560 → ~500 decicycles. This follows Michael's comments in https://ffmpeg.org/pipermail/ffmpeg-devel/2014-August/160599.html
Using two registers for the accumulator didn't help. On the other hand, some reordering between the movs and psadbw allowed going from ~538 to ~500 decicycles. --- libavutil/x86/pixelutils.asm | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libavutil/x86/pixelutils.asm b/libavutil/x86/pixelutils.asm index 8ab0a18..15213d9 100644 --- a/libavutil/x86/pixelutils.asm +++ b/libavutil/x86/pixelutils.asm @@ -134,16 +134,20 @@ cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2 %macro SAD_XMM_16x16 1 INIT_XMM sse2 cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2 - pxor m2, m2 -%rep 8 - mov%1 m0, [src2q] + mov%1 m2, [src2q] + psadbw m2, [src1q] mov%1 m1, [src2q + stride2q] - psadbw m0, [src1q] psadbw m1, [src1q + stride1q] - paddw m2, m0 paddw m2, m1 +%rep 7 lea src1q, [src1q + 2*stride1q] lea src2q, [src2q + 2*stride2q] + mov%1 m0, [src2q] + psadbw m0, [src1q] + mov%1 m1, [src2q + stride2q] + psadbw m1, [src1q + stride1q] + paddw m2, m0 + paddw m2, m1 %endrep movhlps m0, m2 paddw m2, m0 -- 2.0.4 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel