vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3

Andreas Rheinhardt via ffmpeg-cvslog Thu, 04 Dec 2025 06:41:53 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 99fb257f58026590729aa63f18c5c18f55f48c99
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 24 16:11:10 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Dec 4 15:17:37 2025 +0100

    avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3
    
    Doubling the register width makes it possible to avoid a pshufb and a pmaddubsw.
    
    Old benchmarks:
    vp8_put_epel4_h6_c:                                    115.9 ( 1.00x)
    vp8_put_epel4_h6_ssse3:                                 20.2 ( 5.74x)
    vp8_put_epel4_h6v4_c:                                  276.3 ( 1.00x)
    vp8_put_epel4_h6v4_ssse3:                               58.6 ( 4.71x)
    vp8_put_epel4_h6v6_c:                                  363.6 ( 1.00x)
    vp8_put_epel4_h6v6_ssse3:                               62.5 ( 5.82x)
    
    New benchmarks:
    vp8_put_epel4_h6_c:                                    116.4 ( 1.00x)
    vp8_put_epel4_h6_ssse3:                                 16.0 ( 7.29x)
    vp8_put_epel4_h6v4_c:                                  280.9 ( 1.00x)
    vp8_put_epel4_h6v4_ssse3:                               44.3 ( 6.33x)
    vp8_put_epel4_h6v6_c:                                  365.6 ( 1.00x)
    vp8_put_epel4_h6v6_ssse3:                               53.1 ( 6.89x)
    
    Reviewed-by: Ronald S. Bultje <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 50 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 12 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 6c365898ce..2a66e51da6 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -33,6 +33,16 @@ fourtap_filter4_b_m: times 4 db  -6, 123
                      times 4 db  -1,  12
                      times 4 db 123,  -6
 
+sixtap_filter4_hb_m: times 8 db   2, -11
+                     times 4 db 108,  -8
+                     times 4 db  36,   1
+                     times 8 db   3, -16
+                     times 4 db  77, -16
+                     times 4 db  77,   3
+                     times 8 db   1,  -8
+                     times 4 db  36, -11
+                     times 4 db 108,   2
+
 fourtap_filter_hb_m: times 8 db  -6, 123
                      times 8 db  12,  -1
                      times 8 db  -9,  93
@@ -129,6 +139,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define fourtap_filter4_b  picregq
 %define sixtap_filter_hb   picregq
 %define sixtap_filter_b    picregq
+%define sixtap_filter4_hb  picregq
 %define fourtap_filter_v   picregq
 %define sixtap_filter_v    picregq
 %define bilinear_filter_vw picregq
@@ -140,6 +151,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %define fourtap_filter4_b  fourtap_filter4_b_m
 %define sixtap_filter_hb   sixtap_filter_hb_m
 %define sixtap_filter_b    sixtap_filter_b_m
+%define sixtap_filter4_hb  sixtap_filter4_hb_m
 %define fourtap_filter_v   fourtap_filter_v_m
 %define sixtap_filter_v    sixtap_filter_v_m
 %define bilinear_filter_vw bilinear_filter_vw_m
@@ -148,6 +160,7 @@ bilinear_filter_vb_m: times 8 db 7, 1
 %endif
 
 filter4_h4_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3,  4, 4,  5, 5,  6
+filter4_h6_shuf: db 1, 3, 2, 4, 3, 5, 4, 6, 2, 4, 3,  5, 4,  6, 5,  7
 filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
 filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10
 
@@ -180,7 +193,16 @@ SECTION .text
 %define MOV movq
 %endif
 
-cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
+cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 6+2*(%1==8), dst, dststride, src, srcstride, height, mx, picreg
+%if %1 == 4
+    mova      m3, [filter4_h6_shuf]
+%if PIC
+    lea  picregq, [sixtap_filter4_hb_m]
+%endif
+    shl      mxd, 4
+    mova      m4, [sixtap_filter4_hb+mxq-32]
+    mova      m5, [sixtap_filter4_hb+mxq-16]
+%else
     lea      mxd, [mxq*3]
     mova      m3, [filter_h6_shuf2]
     mova      m4, [filter_h6_shuf3]
@@ -190,29 +212,35 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
     mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
     mova      m6, [sixtap_filter_hb+mxq*8-32]
     mova      m7, [sixtap_filter_hb+mxq*8-16]
+%endif
 
 .nextrow:
+%if %1 == 4
+    ; we need nine bytes, so two loads
+    movq      m1, [srcq-1]
+    movq      m0, [srcq-2]
+    punpcklbw m0, m1
+    pshufb    m1, m3
+    pmaddubsw m1, m5
+    pmaddubsw m0, m4
+    movhlps   m2, m1
+%else
     movu      m0, [srcq-2]
     mova      m1, m0
     mova      m2, m0
-%if mmsize == 8
-; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
-; shuffle with a memory operand
-    punpcklbw m0, [srcq+3]
-%else
     pshufb    m0, [filter_h6_shuf1]
-%endif
     pshufb    m1, m3
     pshufb    m2, m4
     pmaddubsw m0, m5
     pmaddubsw m1, m6
     pmaddubsw m2, m7
+%endif
     add     srcq, srcstrideq
-    paddsw    m0, m1
+    paddw     m0, m1
     paddsw    m0, m2
     pmulhrsw  m0, [pw_256]
     packuswb  m0, m0
-    movh  [dstq], m0        ; store
+    MOV   [dstq], m0        ; store
 
     ; go to next line
     add     dstq, dststrideq
@@ -220,7 +248,6 @@ cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, h
     jg .nextrow
     RET
 
-INIT_XMM ssse3
 cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 6+!!(%1 == 8), dst, dststride, src, srcstride, height, mx, picreg
     mova      m2, [pw_256]
 %if %1 == 8
@@ -405,9 +432,8 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     RET
 %endmacro
 
-INIT_MMX ssse3
-FILTER_SSSE3 4
 INIT_XMM ssse3
+FILTER_SSSE3 4
 FILTER_SSSE3 8
 
 INIT_XMM sse2

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 11/15: avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_h6_ssse3

Reply via email to