This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.

commit 3354cbf1b76ff687cae4852fc15b6174c4e36b48
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon May 4 20:28:37 2026 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Jul 1 20:00:46 2026 +0200

    avcodec/x86/vc1dsp_mc: Add size 16 horizontal SSSE3 mc functions
    
    vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_c:             309.1 ( 1.00x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_mmxext:        177.3 ( 1.74x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc10_16_ssse3:          52.3 ( 5.91x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_c:             279.6 ( 1.00x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_mmxext:        148.8 ( 1.88x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc20_16_ssse3:          52.1 ( 5.37x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_c:             332.6 ( 1.00x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_mmxext:        177.3 ( 1.88x)
    vc1dsp.avg_vc1_mspel_pixels_tab_mc30_16_ssse3:          52.5 ( 6.33x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_c:             288.8 ( 1.00x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_mmx:           170.3 ( 1.70x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc10_16_ssse3:          51.3 ( 5.63x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_c:             236.2 ( 1.00x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_mmx:           144.1 ( 1.64x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc20_16_ssse3:          51.3 ( 4.61x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_c:             286.6 ( 1.00x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_mmx:           170.1 ( 1.69x)
    vc1dsp.put_vc1_mspel_pixels_tab_mc30_16_ssse3:          51.2 ( 5.60x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vc1dsp_init.c |  9 ++++++---
 libavcodec/x86/vc1dsp_mc.asm | 27 ++++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 29c28fb2b7..02bea6b052 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -97,6 +97,9 @@ void ff_vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, ptrdiff_t linesize,
 #define MSPEL_FUNCS_SIZE(X, Y, SIZE, XMM) \
     MSPEL_FUNC(put, X, Y, SIZE, XMM);     \
     MSPEL_FUNC(avg, X, Y, SIZE, XMM)
+#define MSPEL_FUNCS(X, Y, XMM)            \
+    MSPEL_FUNCS_SIZE(X, Y,  8, XMM);      \
+    MSPEL_FUNCS_SIZE(X, Y, 16, XMM)
 
 av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
 {
@@ -141,9 +144,9 @@ av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
 
-        MSPEL_FUNCS_SIZE(1, 0, 8, ssse3);
-        MSPEL_FUNCS_SIZE(2, 0, 8, ssse3);
-        MSPEL_FUNCS_SIZE(3, 0, 8, ssse3);
+        MSPEL_FUNCS(1, 0, ssse3);
+        MSPEL_FUNCS(2, 0, ssse3);
+        MSPEL_FUNCS(3, 0, ssse3);
     }
     if (EXTERNAL_SSE4(cpu_flags)) {
         dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_sse4;
diff --git a/libavcodec/x86/vc1dsp_mc.asm b/libavcodec/x86/vc1dsp_mc.asm
index f6204afb23..2228254fd2 100644
--- a/libavcodec/x86/vc1dsp_mc.asm
+++ b/libavcodec/x86/vc1dsp_mc.asm
@@ -200,6 +200,7 @@ HOR_16B_SHIFT2 OP_AVG, avg
 %endif ; HAVE_MMX_INLINE
 
 %define MOV8  movq
+%define MOV16 movu
 
 INIT_XMM ssse3
 %macro HOR_8B 2
@@ -223,7 +224,7 @@ cglobal vc1_%1_mspel_mc30_%2, 4, 4, 6, dst, src, stride, rnd
 
 vc1_%1_mspel_mc30_%2_after_prologue:
     movd              m0, rndd
-    WIN64_SPILL_XMM    7
+    WIN64_SPILL_XMM    7+(%2>>4)
 %define hd  rndd
     mov               hd, %2
     SPLATW            m0, m0
@@ -233,6 +234,7 @@ vc1_%1_mspel_mc30_%2_after_prologue:
     MOV%2             m5, [srcq+1]
     MOV%2             m6, [srcq+2]
 
+%if %2 == 8
     punpcklbw         m3, m4
     pmaddubsw         m3, m1
 %ifidn %1,avg
@@ -249,6 +251,26 @@ vc1_%1_mspel_mc30_%2_after_prologue:
     pavgb             m3, m4
 %endif
     movq          [dstq], m3
+%else
+    SBUTTERFLY        bw, 3, 4, 7
+    pmaddubsw         m3, m1
+    pmaddubsw         m4, m1
+    SBUTTERFLY        bw, 6, 5, 7
+    pmaddubsw         m6, m2
+    pmaddubsw         m5, m2
+    add             srcq, strideq
+    psubw             m3, m0
+    psubw             m4, m0
+    paddw             m3, m6
+    paddw             m4, m5
+    psraw             m3, 6
+    psraw             m4, 6
+    packuswb          m3, m4
+%ifidn %1, avg
+    pavgb             m3, [dstq]
+%endif
+    mova          [dstq], m3
+%endif
     add             dstq, strideq
     dec               hd
     jnz            .loop
@@ -257,3 +279,6 @@ vc1_%1_mspel_mc30_%2_after_prologue:
 
 HOR_8B put, 8
 HOR_8B avg, 8
+
+HOR_8B put, 16
+HOR_8B avg, 16

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to