This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 714cbf1c70a8312ea11f1fb718d56762ff6658cb
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 24 09:16:26 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Thu Dec 4 15:17:37 2025 +0100

    avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v4_ssse3
    
    Switching to xmm registers allows processing two rows in parallel,
    leading to speedups. It is also ABI compliant (no more missing emms).
    
    Old benchmarks:
    vp8_put_epel4_v4_c:                                     96.8 ( 1.00x)
    vp8_put_epel4_v4_ssse3:                                 28.2 ( 3.43x)
    
    New benchmarks:
    vp8_put_epel4_v4_c:                                     95.1 ( 1.00x)
    vp8_put_epel4_v4_ssse3:                                 22.8 ( 4.17x)
    
    Reviewed-by: Ronald S. Bultje <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp8dsp.asm | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm
index 4778944ac7..fd60feaf1f 100644
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -238,6 +238,7 @@ cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, h
     jg .nextrow
     RET
 
+INIT_XMM ssse3
 cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
     shl      myd, 4
 %if PIC
@@ -250,13 +251,38 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     ; read 3 lines
     mov  picregq, srcstrideq
     neg  picregq
-    movh      m0, [srcq+picregq]
-    movh      m1, [srcq]
-    movh      m2, [srcq+srcstrideq]
+    MOV       m0, [srcq+picregq]
+    MOV       m1, [srcq]
+    MOV       m2, [srcq+srcstrideq]
+    lea     srcq, [srcq+2*srcstrideq]
     punpcklbw m0, m2
 
+%if %1 == 4
+.next2rows:
+    movd       m3, [srcq]
+    movd       m4, [srcq+srcstrideq]
+    punpcklbw  m1, m3
+    punpcklqdq m0, m1
+    punpcklbw  m2, m4
+    pmaddubsw  m0, m5
+    punpcklqdq m1, m2
+    pmaddubsw  m1, m6
+    lea     srcq, [srcq+2*srcstrideq]
+    paddsw     m1, m0
+    pmulhrsw   m1, m7
+    mova       m0, m2
+    packuswb   m1, m1
+    movd   [dstq], m1
+    mova       m2, m4
+    psrldq     m1, 4
+    movd [dstq+dststrideq], m1
+    mova       m1, m3
+    lea      dstq, [dstq+2*dststrideq]
+    sub   heightd, 2
+    jg .next2rows
+%else
 .nextrow:
-    movh      m3, [srcq+2*srcstrideq]      ; read new row
+    movh      m3, [srcq]      ; read new row
     pmaddubsw m0, m5
     punpcklbw m1, m3
     pmaddubsw m4, m1, m6
@@ -273,9 +299,9 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr
     add      dstq, dststrideq
     dec   heightd                          ; next row
     jg .nextrow
+%endif
     RET
 
-INIT_XMM ssse3
 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
     lea      myd, [myq*3]
 %if PIC

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to