avcodec/vp9itxfm{,_16bpp}: Remove MMXEXT functions overridden by SSSE3

Andreas Rheinhardt via ffmpeg-cvslog Mon, 08 Dec 2025 11:01:46 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 36204fbc3c593f6d13a3dac7a2db448941e3a9e5
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 30 20:49:51 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Mon Dec 8 19:27:51 2025 +0100

    avcodec/vp9itxfm{,_16bpp}: Remove MMXEXT functions overridden by SSSE3
    
    SSSE3 is already quite old (introduced in 2006 for Intel, 2011 for AMD),
    so the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 versions.
    This commit therefore removes the MMXEXT functions overridden
    by them (which don't abide by the ABI) to get closer to a removal
    of emms_c.
    
    Reviewed-by: Ronald S. Bultje <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/vp9dsp_init.c                |  2 --
 libavcodec/x86/vp9dsp_init_16bpp_template.c |  4 ----
 libavcodec/x86/vp9itxfm.asm                 | 30 +----------------------------
 libavcodec/x86/vp9itxfm_16bpp.asm           | 18 +----------------
 4 files changed, 2 insertions(+), 52 deletions(-)

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 85332da2b9..e479fd25ee 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -101,7 +101,6 @@ itxfm_func(iadst, idct,  size, opt); \
 itxfm_func(idct,  iadst, size, opt); \
 itxfm_func(iadst, iadst, size, opt)
 
-itxfm_func(idct,  idct,  4, mmxext);
 itxfm_func(idct,  iadst, 4, sse2);
 itxfm_func(iadst, idct,  4, sse2);
 itxfm_func(iadst, iadst, 4, sse2);
@@ -284,7 +283,6 @@ av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp, int bpp, int bitexact)
         dsp->loop_filter_8[1][1] = ff_vp9_loop_filter_v_8_8_mmxext;
         init_fpel_func(4, 1,  4, avg, _8, mmxext);
         init_fpel_func(3, 1,  8, avg, _8, mmxext);
-        dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_mmxext;
         dsp->intra_pred[TX_4X4][HOR_DOWN_PRED] = ff_vp9_ipred_hd_4x4_mmxext;
         dsp->intra_pred[TX_4X4][VERT_LEFT_PRED] = ff_vp9_ipred_vl_4x4_mmxext;
     }
diff --git a/libavcodec/x86/vp9dsp_init_16bpp_template.c b/libavcodec/x86/vp9dsp_init_16bpp_template.c
index 54ff8892cf..969db94d3c 100644
--- a/libavcodec/x86/vp9dsp_init_16bpp_template.c
+++ b/libavcodec/x86/vp9dsp_init_16bpp_template.c
@@ -123,7 +123,6 @@ decl_ipred_fns(tm, BPC, mmxext, sse2);
 
 decl_itxfm_func(iwht, iwht, 4, BPC, mmxext);
 #if BPC == 10
-decl_itxfm_func(idct,  idct,  4, BPC, mmxext);
 decl_itxfm_funcs(4, BPC, ssse3);
 decl_itxfm_funcs(16, BPC, avx512icl);
 decl_itxfm_func(idct,  idct, 32, BPC, avx512icl);
@@ -184,9 +183,6 @@ av_cold void INIT_FUNC(VP9DSPContext *dsp, int bitexact)
         init_ipred_func(tm, TM_VP8, 4, BPC, mmxext);
         if (!bitexact) {
             init_itx_func_one(4 /* lossless */, iwht, iwht, 4, BPC, mmxext);
-#if BPC == 10
-            init_itx_func(TX_4X4, DCT_DCT, idct, idct, 4, 10, mmxext);
-#endif
         }
     }
 
diff --git a/libavcodec/x86/vp9itxfm.asm b/libavcodec/x86/vp9itxfm.asm
index fe650d519c..bd5966646c 100644
--- a/libavcodec/x86/vp9itxfm.asm
+++ b/libavcodec/x86/vp9itxfm.asm
@@ -223,49 +223,28 @@ cglobal vp9_iwht_iwht_4x4_add, 3, 3, 0, dst, stride, block, eob
     VP9_STORE_2X         2,  3,  6,  7,  4
 %endmacro
 
-%macro IDCT_4x4_FN 1
-INIT_MMX %1
+INIT_MMX ssse3
 cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
 
-%if cpuflag(ssse3)
     cmp eobd, 4 ; 2x2 or smaller
     jg .idctfull
 
     cmp eobd, 1 ; faster path for when only DC is set
     jne .idct2x2
-%else
-    cmp eobd, 1
-    jg .idctfull
-%endif
 
-%if cpuflag(ssse3)
     movd                m0, [blockq]
     mova                m5, [pw_11585x2]
     pmulhrsw            m0, m5
     pmulhrsw            m0, m5
-%else
-    DEFINE_ARGS dst, stride, block, coef
-    movsx            coefd, word [blockq]
-    imul             coefd, 11585
-    add              coefd, 8192
-    sar              coefd, 14
-    imul             coefd, 11585
-    add              coefd, (8 << 14) + 8192
-    sar              coefd, 14 + 4
-    movd                m0, coefd
-%endif
     pshufw              m0, m0, 0
     pxor                m4, m4
     movh          [blockq], m4
-%if cpuflag(ssse3)
     pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
-%endif
     VP9_STORE_2X         0,  0,  6,  7,  4
     lea               dstq, [dstq+2*strideq]
     VP9_STORE_2X         0,  0,  6,  7,  4
     RET
 
-%if cpuflag(ssse3)
 ; faster path for when only top left 2x2 block is set
 .idct2x2:
     movd                m0, [blockq+0]
@@ -285,16 +264,13 @@ cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
     movh       [blockq+ 8], m4
     VP9_IDCT4_WRITEOUT
     RET
-%endif
 
 .idctfull: ; generic full 4x4 idct/idct
     mova                m0, [blockq+ 0]
     mova                m1, [blockq+ 8]
     mova                m2, [blockq+16]
     mova                m3, [blockq+24]
-%if cpuflag(ssse3)
     mova                m6, [pw_11585x2]
-%endif
     mova                m7, [pd_8192]       ; rounding
     VP9_IDCT4_1D
     TRANSPOSE4x4W  0, 1, 2, 3, 4
@@ -306,10 +282,6 @@ cglobal vp9_idct_idct_4x4_add, 4, 4, 0, dst, stride, block, eob
     mova       [blockq+24], m4
     VP9_IDCT4_WRITEOUT
     RET
-%endmacro
-
-IDCT_4x4_FN mmxext
-IDCT_4x4_FN ssse3
 
 ;-------------------------------------------------------------------------------------------
 ; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
diff --git a/libavcodec/x86/vp9itxfm_16bpp.asm b/libavcodec/x86/vp9itxfm_16bpp.asm
index ebe6222285..161c73f5a1 100644
--- a/libavcodec/x86/vp9itxfm_16bpp.asm
+++ b/libavcodec/x86/vp9itxfm_16bpp.asm
@@ -243,29 +243,21 @@ IWHT4_FN 12, 4095
 ; 4x4 coefficients are 5+depth+sign bits, so for 10bpp, everything still fits
 ; in 15+1 words without additional effort, since the coefficients are 15bpp.
 
-%macro IDCT4_10_FN 0
+INIT_MMX ssse3
 cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
     cmp               eobd, 1
     jg .idctfull
 
     ; dc-only
     pxor                m4, m4
-%if cpuflag(ssse3)
     movd                m0, [blockq]
     movd          [blockq], m4
     mova                m5, [pw_11585x2]
     pmulhrsw            m0, m5
     pmulhrsw            m0, m5
-%else
-    DEFINE_ARGS dst, stride, block, coef
-    DC_ONLY              4, m4
-    movd                m0, coefd
-%endif
     pshufw              m0, m0, 0
     mova                m5, [pw_1023]
-%if cpuflag(ssse3)
     pmulhrsw            m0, [pw_2048]       ; (x*2048 + (1<<14))>>15 <=> (x+8)>>4
-%endif
     VP9_STORE_2X         0,  0,  6,  7,  4,  5
     lea               dstq, [dstq+2*strideq]
     VP9_STORE_2X         0,  0,  6,  7,  4,  5
@@ -281,9 +273,7 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
     packssdw            m2, [blockq+2*16+8]
     packssdw            m3, [blockq+3*16+8]
 
-%if cpuflag(ssse3)
     mova                m6, [pw_11585x2]
-%endif
     mova                m7, [pd_8192]       ; rounding
     VP9_IDCT4_1D
     TRANSPOSE4x4W  0, 1, 2, 3, 4
@@ -293,12 +283,6 @@ cglobal vp9_idct_idct_4x4_add_10, 4, 4, 8, dst, stride, block, eob
     ZERO_BLOCK      blockq, 16, 4, m4
     VP9_IDCT4_WRITEOUT
     RET
-%endmacro
-
-INIT_MMX mmxext
-IDCT4_10_FN
-INIT_MMX ssse3
-IDCT4_10_FN
 
 %macro IADST4_FN 4
 cglobal vp9_%1_%3_4x4_add_10, 3, 3, 0, dst, stride, block, eob

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 03/08: avcodec/vp9itxfm{,_16bpp}: Remove MMXEXT functions overridden by SSSE3

Reply via email to