This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit dd5dc254ff505b84e0a88361aa2d1e1280f579e7 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Mon Dec 1 00:17:55 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Dec 8 19:31:59 2025 +0100 avcodec/x86/vp9mc: Avoid reloads, MMX regs in width 4 vert 8tap func Four rows of four bytes fit into one xmm register; therefore one can arrange the rows as follows (A,B,C: first, second, third etc. row) xmm0: ABABABAB BCBCBCBC xmm1: CDCDCDCD DEDEDEDE xmm2: EFEFEFEF FGFGFGFG xmm3: GHGHGHGH HIHIHIHI and use four pmaddubsw to calculate two rows in parallel. The history fits into four registers, making this possible even on 32bit systems. Old benchmarks (Unix 64): vp9_avg_8tap_smooth_4v_8bpp_c: 105.5 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 16.4 ( 6.44x) vp9_put_8tap_smooth_4v_8bpp_c: 99.3 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 15.4 ( 6.44x) New benchmarks (Unix 64): vp9_avg_8tap_smooth_4v_8bpp_c: 105.0 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 11.8 ( 8.90x) vp9_put_8tap_smooth_4v_8bpp_c: 99.7 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 10.7 ( 9.30x) Old benchmarks (x86-32): vp9_avg_8tap_smooth_4v_8bpp_c: 138.2 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 28.0 ( 4.93x) vp9_put_8tap_smooth_4v_8bpp_c: 123.6 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 28.0 ( 4.41x) New benchmarks (x86-32): vp9_avg_8tap_smooth_4v_8bpp_c: 139.0 ( 1.00x) vp9_avg_8tap_smooth_4v_8bpp_ssse3: 20.1 ( 6.92x) vp9_put_8tap_smooth_4v_8bpp_c: 124.5 ( 1.00x) vp9_put_8tap_smooth_4v_8bpp_ssse3: 19.9 ( 6.26x) Loading the constants into registers did not turn out to be advantageous here (not to mention Win64, where this would necessitate saving and restoring ever more registers); probably because there are only two loop iterations. Reviewed-by: Ronald S. 
Bultje <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 88 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index 682c6a6ea0..85249bb507 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -496,12 +496,84 @@ INIT_XMM sse2 filter_sse2_v_fn put filter_sse2_v_fn avg +%macro filter4_v_fn 1 +%if ARCH_X86_64 +cglobal vp9_%1_8tap_1d_v_4_8, 6, 7, 8, dst, dstride, src, sstride, h, filtery, sstride3 +%else +cglobal vp9_%1_8tap_1d_v_4_8, 4, 5, 8, dst, dstride, src, sstride, filtery +%define hd r4mp +%define sstride3q filteryq +%endif + lea sstride3q, [sstrideq*3] + sub srcq, sstride3q + movd m0, [srcq] + movd m1, [srcq+sstrideq] + movd m2, [srcq+sstrideq*2] + movd m3, [srcq+sstride3q] + lea srcq, [srcq+sstrideq*4] + movd m4, [srcq] + movd m5, [srcq+sstrideq] + punpcklbw m0, m1 + punpcklbw m1, m2 + punpcklbw m2, m3 + punpcklbw m3, m4 + punpcklqdq m0, m1 + movd m1, [srcq+sstrideq*2] + add srcq, sstride3q +%if ARCH_X86_32 + mov filteryq, r5mp +%endif + punpcklqdq m2, m3 + punpcklbw m4, m5 + punpcklbw m5, m1 + punpcklqdq m4, m5 +.loop: + pmaddubsw m0, [filteryq] + movd m3, [srcq] + movd m5, [srcq+sstrideq] + pmaddubsw m7, m4, [filteryq+64] + pmaddubsw m6, m2, [filteryq+32] + punpcklbw m1, m3 + punpcklbw m3, m5 + punpcklqdq m1, m3 + pmaddubsw m3, m1, [filteryq+96] + paddw m0, [pw_64] + lea srcq, [srcq+2*sstrideq] + paddw m7, m0 + mova m0, m2 + mova m2, m4 +%ifidn %1, avg + movd m4, [dstq] +%endif + paddw m6, m3 +%ifidn %1, avg + movd m3, [dstq+dstrideq] +%endif + paddsw m6, m7 + psraw m6, 7 + packuswb m6, m6 + pshuflw m7, m6, 0xE +%ifidn %1, avg + pavgb m6, m4 +%endif + movd [dstq], m6 + mova m4, m1 +%ifidn %1, avg + pavgb m7, m3 +%endif + movd [dstq+dstrideq], m7 + lea dstq, [dstq+2*dstrideq] + mova m1, m5 + sub hd, 2 + jg .loop + RET +%endmacro + %macro filter_v_fn 1 -%assign %%px mmsize/2 
%if ARCH_X86_64 -cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 +cglobal vp9_%1_8tap_1d_v_8_8, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 %else -cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 +cglobal vp9_%1_8tap_1d_v_8_8, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 mov filteryq, r5mp %define hd r4mp %endif @@ -510,7 +582,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f lea src4q, [srcq+sstrideq] sub srcq, sstride3q mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 mova m8, [filteryq+32] mova m9, [filteryq+64] mova m10, [filteryq+96] @@ -533,7 +605,7 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f punpcklbw m4, m5 punpcklbw m1, m3 pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%if ARCH_X86_64 pmaddubsw m2, m8 pmaddubsw m4, m9 pmaddubsw m1, m10 @@ -560,9 +632,9 @@ cglobal vp9_%1_8tap_1d_v_ %+ %%px %+ _8, 4, 7, 11, dst, dstride, src, sstride, f RET %endmacro -INIT_MMX ssse3 -filter_v_fn put -filter_v_fn avg +INIT_XMM ssse3 +filter4_v_fn put +filter4_v_fn avg INIT_XMM ssse3 filter_v_fn put _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
