m15 is zeroed but never used. Where it is not needed (prores), reduce the number of xmm registers used by one; otherwise (simple_idct), make use of it to hold the rounder in the row pass. --- libavcodec/x86/proresdsp.asm | 8 ++++---- libavcodec/x86/simple_idct10.asm | 9 +++++---- libavcodec/x86/simple_idct10_template.asm | 17 ++++++++--------- 3 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/libavcodec/x86/proresdsp.asm b/libavcodec/x86/proresdsp.asm index 18cf15b..3fb71ba 100644 --- a/libavcodec/x86/proresdsp.asm +++ b/libavcodec/x86/proresdsp.asm @@ -37,17 +37,17 @@ cextern pw_1019 section .text align=16 -%macro idct_put_fn 1 -cglobal prores_idct_put_10, 4, 4, %1 +%macro idct_put_fn 0 +cglobal prores_idct_put_10, 4, 4, 15 IDCT_PUT_FN pw_1, 15, pw_88, 18, pw_4, pw_1019, r3 RET %endmacro INIT_XMM sse2 -idct_put_fn 16 +idct_put_fn %if HAVE_AVX_EXTERNAL INIT_XMM avx -idct_put_fn 16 +idct_put_fn %endif %endif diff --git a/libavcodec/x86/simple_idct10.asm b/libavcodec/x86/simple_idct10.asm index 982fb1e..cd2b905 100644 --- a/libavcodec/x86/simple_idct10.asm +++ b/libavcodec/x86/simple_idct10.asm @@ -37,17 +37,18 @@ pd_round: times 4 dd 1<<(12-1) section .text align=16 -%macro idct_put_fn 1 -cglobal simple_idct10_put, 3, 3, %1 +%macro idct_put_fn 0 +cglobal simple_idct10_put, 3, 3, 16 + mova m15, [pd_round] IDCT_PUT_FN "", 12, pw_16, 19, 0, pw_1023 RET %endmacro INIT_XMM sse2 -idct_put_fn 16 +idct_put_fn %if HAVE_AVX_EXTERNAL INIT_XMM avx -idct_put_fn 16 +idct_put_fn %endif %endif diff --git a/libavcodec/x86/simple_idct10_template.asm b/libavcodec/x86/simple_idct10_template.asm index 86c2765..d4a08f8 100644 --- a/libavcodec/x86/simple_idct10_template.asm +++ b/libavcodec/x86/simple_idct10_template.asm @@ -90,14 +90,14 @@ cextern w7_min_w5 pmaddwd m1, [w4_plus_w2] %ifstr %1 ; 1<<(%1-1) - paddd m2, [pd_round] - paddd m3, [pd_round] - paddd m4, [pd_round] - paddd m5, [pd_round] - paddd m6, [pd_round] - paddd m7, [pd_round] - paddd m0, [pd_round] - paddd m1, [pd_round] + paddd m2, m15 + paddd m3, m15 + paddd m4, m15 + paddd m5, m15 + paddd m6, m15 + paddd m7, m15 + paddd m0, m15 + paddd m1, m15 %endif ; a0: -1*row[0]-1*row[2] @@ -237,7 +237,6 @@ cextern w7_min_w5 %macro IDCT_PUT_FN 6-7 movsxd r1, r1d - pxor m15, m15 ; zero ; for (i = 0; i < 8; i++) ; idctRowCondDC(block + i*8); -- 2.6.0 _______________________________________________ ffmpeg-devel 
mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel