vp9mc: Avoid MMX regs in width 4 hor 8tap funcs

Andreas Rheinhardt via ffmpeg-cvslog Mon, 08 Dec 2025 11:02:12 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 38e2174ce40e5c49f05cb0be81d614c906dbd2c3
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Dec 2 13:47:32 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Dec 8 19:34:35 2025 +0100

    avcodec/x86/vp9mc: Avoid MMX regs in width 4 hor 8tap funcs
    
    Using wider registers (and pshufb) allows halving the number of
    pmaddubsw instructions used. It is also ABI compliant (no more missing emms).
    
    Old benchmarks:
    vp9_avg_8tap_smooth_4h_8bpp_c:                          97.6 ( 1.00x)
    vp9_avg_8tap_smooth_4h_8bpp_ssse3:                      15.0 ( 6.52x)
    vp9_avg_8tap_smooth_4hv_8bpp_c:                        342.9 ( 1.00x)
    vp9_avg_8tap_smooth_4hv_8bpp_ssse3:                     54.0 ( 6.35x)
    vp9_put_8tap_smooth_4h_8bpp_c:                          94.9 ( 1.00x)
    vp9_put_8tap_smooth_4h_8bpp_ssse3:                      14.2 ( 6.67x)
    vp9_put_8tap_smooth_4hv_8bpp_c:                        325.9 ( 1.00x)
    vp9_put_8tap_smooth_4hv_8bpp_ssse3:                     52.5 ( 6.20x)
    
    New benchmarks:
    vp9_avg_8tap_smooth_4h_8bpp_c:                          97.6 ( 1.00x)
    vp9_avg_8tap_smooth_4h_8bpp_ssse3:                      10.8 ( 9.08x)
    vp9_avg_8tap_smooth_4hv_8bpp_c:                        342.4 ( 1.00x)
    vp9_avg_8tap_smooth_4hv_8bpp_ssse3:                     38.8 ( 8.82x)
    vp9_put_8tap_smooth_4h_8bpp_c:                          94.7 ( 1.00x)
    vp9_put_8tap_smooth_4h_8bpp_ssse3:                       9.7 ( 9.75x)
    vp9_put_8tap_smooth_4hv_8bpp_c:                        321.7 ( 1.00x)
    vp9_put_8tap_smooth_4hv_8bpp_ssse3:                     37.0 ( 8.69x)
    
    Reviewed-by: Ronald S. Bultje <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp9mc.asm | 50 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/libavcodec/x86/vp9mc.asm b/libavcodec/x86/vp9mc.asm
index 85249bb507..42f9074c21 100644
--- a/libavcodec/x86/vp9mc.asm
+++ b/libavcodec/x86/vp9mc.asm
@@ -114,6 +114,9 @@ FILTER sse2
 ; int16_t ff_filters_16bpp[3][15][4][16]
 FILTER 16bpp
 
+filter4_h_perm0: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 ; pshufb control: adjacent byte pairs (i,i+1), i=0..3 in the low half, i=2..5 in the high half
+filter4_h_perm1: db 1, 2, 2, 3, 3, 4, 4, 5, 3, 4, 4, 5, 5, 6, 6, 7 ; pshufb control: adjacent byte pairs (i,i+1), i=1..4 in the low half, i=3..6 in the high half
+
 %if HAVE_AVX512ICL_EXTERNAL && ARCH_X86_64
 ALIGN 64
 spel_h_perm16:  db  0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
@@ -280,12 +283,51 @@ INIT_XMM sse2
 filter_sse2_h_fn put
 filter_sse2_h_fn avg
 
+%macro filter4_h_fn 2 ; %1 = put or avg, %2 = vector register count handed to cglobal; emits the width-4 horizontal 8-tap function
+cglobal vp9_%1_8tap_1d_h_4_8, 6, 6, %2, dst, dstride, src, sstride, h, filtery
+    mova        m2, [filter4_h_perm0] ; shuffle control for the 8 bytes loaded from srcq-3
+    mova        m3, [filter4_h_perm1] ; shuffle control for the 8 bytes loaded from srcq+0
+    pcmpeqw     m4, m4                ; all bits set: every word = -1
+    movu        m5, [filteryq+24]     ; unaligned 16 bytes: presumably last 8 of the coefficient row at +0 and first 8 of the row at +32 -- confirm against the filter table layout
+    movu        m6, [filteryq+88]     ; same trick for the rows at +64 and +96 (presumably) -- confirm
+    psllw       m4, 6   ; pw_m64: -1 << 6 = -64 in every word
+.loop:
+    movq        m0, [srcq-3]          ; src[-3..4]
+    movq        m1, [srcq+0]          ; src[0..7]
+    pshufb      m0, m2                ; low half: pairs (src[x-3],src[x-2]); high half: (src[x-1],src[x]); x = 0..3
+    pshufb      m1, m3                ; low half: pairs (src[x+1],src[x+2]); high half: (src[x+3],src[x+4]); x = 0..3
+    pmaddubsw   m0, m5                ; multiply each byte pair by its coefficient pair and add -> one word per pair
+    pmaddubsw   m1, m6
+%ifidn %1, avg
+    movd        m7, [dstq]            ; existing 4 dst pixels, averaged in below
+%endif
+    add       srcq, sstrideq          ; advance to the next source row
+    paddw       m0, m1                ; low half: (x-3,x-2)+(x+1,x+2) terms; high half: (x-1,x)+(x+3,x+4) terms
+    movhlps     m1, m0                ; copy m0's high half into m1's low half
+    psubw       m0, m4                ; subtracting -64 adds the rounding bias of 64
+    paddsw      m0, m1                ; saturating add merges both halves: low 4 words = full 8-tap sum + 64
+    psraw       m0, 7                 ; arithmetic shift right by 7
+    packuswb    m0, m0                ; saturate words to unsigned bytes
+%ifidn %1, avg
+    pavgb       m0, m7                ; rounded byte average with the existing dst pixels
+%endif
+    movd    [dstq], m0                ; store 4 output pixels
+    add       dstq, dstrideq          ; advance to the next destination row
+    sub         hd, 1                 ; one row done
+    jg .loop                          ; repeat while rows remain
+    RET
+%endmacro
+
+INIT_XMM ssse3
+filter4_h_fn put, 7 ; put variant touches m0-m6
+filter4_h_fn avg, 8 ; avg variant additionally uses m7 for the dst pixels
+
 %macro filter_h_fn 1
 %assign %%px mmsize/2
 cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h, filtery
     mova        m6, [pw_256]
     mova        m7, [filteryq+ 0]
-%if ARCH_X86_64 && mmsize > 8
+%ifdef m8
     mova        m8, [filteryq+32]
     mova        m9, [filteryq+64]
     mova       m10, [filteryq+96]
@@ -305,7 +347,7 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h
     punpcklbw   m4, m5
     punpcklbw   m1, m3
     pmaddubsw   m0, m7
-%if ARCH_X86_64 && mmsize > 8
+%ifdef m8
     pmaddubsw   m2, m8
     pmaddubsw   m4, m9
     pmaddubsw   m1, m10
@@ -332,10 +374,6 @@ cglobal vp9_%1_8tap_1d_h_ %+ %%px %+ _8, 6, 6, 11, dst, dstride, src, sstride, h
     RET
 %endmacro
 
-INIT_MMX ssse3
-filter_h_fn put
-filter_h_fn avg
-
 INIT_XMM ssse3
 filter_h_fn put
 filter_h_fn avg

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 06/08: avcodec/x86/vp9mc: Avoid MMX regs in width 4 hor 8tap funcs

Reply via email to