This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit 38e2174ce40e5c49f05cb0be81d614c906dbd2c3 Author: Andreas Rheinhardt <[email protected]> AuthorDate: Tue Dec 2 13:47:32 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Mon Dec 8 19:34:35 2025 +0100 avcodec/x86/vp9mc: Avoid MMX regs in width 4 hor 8tap funcs Using wider registers (and pshufb) allows halving the number of pmaddubsw instructions used. It is also ABI compliant (no more missing emms). Old benchmarks: vp9_avg_8tap_smooth_4h_8bpp_c: 97.6 ( 1.00x) vp9_avg_8tap_smooth_4h_8bpp_ssse3: 15.0 ( 6.52x) vp9_avg_8tap_smooth_4hv_8bpp_c: 342.9 ( 1.00x) vp9_avg_8tap_smooth_4hv_8bpp_ssse3: 54.0 ( 6.35x) vp9_put_8tap_smooth_4h_8bpp_c: 94.9 ( 1.00x) vp9_put_8tap_smooth_4h_8bpp_ssse3: 14.2 ( 6.67x) vp9_put_8tap_smooth_4hv_8bpp_c: 325.9 ( 1.00x) vp9_put_8tap_smooth_4hv_8bpp_ssse3: 52.5 ( 6.20x) New benchmarks: vp9_avg_8tap_smooth_4h_8bpp_c: 97.6 ( 1.00x) vp9_avg_8tap_smooth_4h_8bpp_ssse3: 10.8 ( 9.08x) vp9_avg_8tap_smooth_4hv_8bpp_c: 342.4 ( 1.00x) vp9_avg_8tap_smooth_4hv_8bpp_ssse3: 38.8 ( 8.82x) vp9_put_8tap_smooth_4h_8bpp_c: 94.7 ( 1.00x) vp9_put_8tap_smooth_4h_8bpp_ssse3: 9.7 ( 9.75x) vp9_put_8tap_smooth_4hv_8bpp_c: 321.7 ( 1.00x) vp9_put_8tap_smooth_4hv_8bpp_ssse3: 37.0 ( 8.69x) Reviewed-by: Ronald S. 
Bultje <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp9mc.asm | 50 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm index 85249bb507..42f9074c21 100644 --- a/libavcodec/x86/vp9mc.asm +++ b/libavcodec/x86/vp9mc.asm @@ -114,6 +114,9 @@ FILTER sse2 ; int16_t ff_filters_16bpp[3][15][4][16] FILTER 16bpp +filter4_h_perm0: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 +filter4_h_perm1: db 1, 2, 2, 3, 3, 4, 4, 5, 3, 4, 4, 5, 5, 6, 6, 7 + %if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64 ALIGN 64 spel_h_perm16: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 @@ -280,12 +283,51 @@ INIT_XMM sse2 filter_sse2_h_fn put filter_sse2_h_fn avg +%macro filter4_h_fn 2 +cglobal vp9_%1_8tap_1d_h_4_8, 6, 6, %2, dst, dstride, src, sstride, h, filtery + mova m2, [filter4_h_perm0] + mova m3, [filter4_h_perm1] + pcmpeqw m4, m4 + movu m5, [filteryq+24] + movu m6, [filteryq+88] + psllw m4, 6 ; pw_m64 +.loop: + movq m0, [srcq-3] + movq m1, [srcq+0] + pshufb m0, m2 + pshufb m1, m3 + pmaddubsw m0, m5 + pmaddubsw m1, m6 +%ifidn %1, avg + movd m7, [dstq] +%endif + add srcq, sstrideq + paddw m0, m1 + movhlps m1, m0 + psubw m0, m4 + paddsw m0, m1 + psraw m0, 7 + packuswb m0, m0 +%ifidn %1, avg + pavgb m0, m7 +%endif + movd [dstq], m0 + add dstq, dstrideq + sub hd, 1 + jg .loop + RET +%endmacro + +INIT_XMM ssse3 +filter4_h_fn put, 7 +filter4_h_fn avg, 8 + %macro filter_h_fn 1 %assign %%px mmsize/2 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery mova m6, [pw_256] mova m7, [filteryq+ 0] -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 mova m8, [filteryq+32] mova m9, [filteryq+64] mova m10, [filteryq+96] @@ -305,7 +347,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h punpcklbw m4, m5 punpcklbw m1, m3 pmaddubsw m0, m7 -%if ARCH_X86_64 && mmsize > 8 +%ifdef m8 pmaddubsw m2, m8 
pmaddubsw m4, m9 pmaddubsw m1, m10 @@ -332,10 +374,6 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h RET %endmacro -INIT_MMX ssse3 -filter_h_fn put -filter_h_fn avg - INIT_XMM ssse3 filter_h_fn put filter_h_fn avg _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
