asm: Remove wrong comment, rename FF_REG_sp

ffmpeg-git--- via ffmpeg-cvslog Tue, 18 Nov 2025 12:08:10 -0800

The branch, master has been updated
       via  5bf57a925ca57ba94538f64a22c7d14234794c7d (commit)
       via  99209c287687705fe1eee775cb4f7f1d0aa94a1e (commit)
       via  b890cd0f73750f0ca526a0b848f3daa48ae6eca5 (commit)
       via  aeb138679a8f97f6c4716ccd91fac3adbe7bb4d1 (commit)
       via  0d3a88e55fc443640ed3c57c9fc906b1ed8a33b8 (commit)
       via  1c00e094274b8571ea326311ff0425ba2dac0fd0 (commit)
       via  d633fa0433de093c9a1257aed519b806b1054f21 (commit)
       via  2cfef7031ca4620e4744534527fe1674963bfdda (commit)
       via  503afa40f7d6227ec25d42d40275f810940b0959 (commit)
      from  00ef656a85f245a400b0cd83a0732c892703a7ae (commit)



- Log -----------------------------------------------------------------
commit 5bf57a925ca57ba94538f64a22c7d14234794c7d
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 16 12:10:22 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100

    avutil/x86/asm: Remove wrong comment, rename FF_REG_sp
    
    Before FFmpeg commit 531b0a316b24f00965cd8a88efdbea2c6d63147f,
    FFmpeg used REG_SP as macro for the stack pointer, yet this
    clashed with a REG_SP define in Solaris system headers, so it
    was changed to REG_sp and a comment was added for this.
    
    Libav fixed it by adding an FF_ prefix to the macros in
    1e9c5bf4c136fe9e010cc8a7e7270bba0d1bf45e. FFmpeg switched
    to using these prefixes in 9eb3da2f9942cf1b1148d242bccfc383f666feb6,
    using FF_REG_sp instead of Libav's FF_REG_SP. In said commit
    the comment was changed to claim that Solaris system headers
    define FF_REG_SP, but this is (most likely) wrong.
    
    This commit removes the wrong comment and renames the (actually unused)
    macro to FF_REG_SP to make it consistent with FF_REG_BP.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavutil/x86/asm.h b/libavutil/x86/asm.h
index 9bff42d628..f06ea25035 100644
--- a/libavutil/x86/asm.h
+++ b/libavutil/x86/asm.h
@@ -38,8 +38,7 @@ typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg;
 #    define FF_PTR_SIZE "8"
 typedef int64_t x86_reg;
 
-/* FF_REG_SP is defined in Solaris sys headers, so use FF_REG_sp */
-#    define FF_REG_sp "rsp"
+#    define FF_REG_SP "rsp"
 #    define FF_REG_BP "rbp"
 #    define FF_REGBP   rbp
 #    define FF_REGa    rax
@@ -60,7 +59,7 @@ typedef int64_t x86_reg;
 #    define FF_PTR_SIZE "4"
 typedef int32_t x86_reg;
 
-#    define FF_REG_sp "esp"
+#    define FF_REG_SP "esp"
 #    define FF_REG_BP "ebp"
 #    define FF_REGBP   ebp
 #    define FF_REGa    eax

commit 99209c287687705fe1eee775cb4f7f1d0aa94a1e
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Nov 16 11:10:07 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100

    avcodec/x86/mpegvideoenc_template: Reduce number of registers used
    
    qmat and bias always have a constant offset, so one can use one register
    to address both of them. This allows removing the check for HAVE_6REGS
    (untested on a system where HAVE_6REGS is false).
    Also avoid FF_REG_a while at it.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index c667dcd2a2..24dd049200 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -39,8 +39,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
     36, 37, 49, 50, 58, 59, 63, 64,
 };
 
-#if HAVE_6REGS
-
 #if HAVE_SSE2_INLINE
 #define COMPILE_TEMPLATE_SSSE3  0
 #define RENAME(a)      a ## _sse2
@@ -55,8 +53,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_SSSE3_INLINE */
 
-#endif /* HAVE_6REGS */
-
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
     const int dct_algo = s->c.avctx->dct_algo;
@@ -65,11 +61,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 #if HAVE_SSE2_INLINE
         int cpu_flags = av_get_cpu_flags();
         if (INLINE_SSE2(cpu_flags)) {
-#if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
-#endif
         }
-#if HAVE_6REGS && HAVE_SSSE3_INLINE
+#if HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;
 #endif
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index b5417f6d32..e6ce791347 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -70,7 +70,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
 {
     x86_reg last_non_zero_p1;
     int level=0, q; //=0 is because gcc says uninitialized ...
-    const uint16_t *qmat, *bias;
+    const uint16_t *qmat;
     LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
     //s->fdct (block);
@@ -86,11 +86,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         int dummy;
         if (n < 4){
             q = s->c.y_dc_scale;
-            bias = s->q_intra_matrix16[qscale][1];
             qmat = s->q_intra_matrix16[qscale][0];
         }else{
             q = s->c.c_dc_scale;
-            bias = s->q_chroma_intra_matrix16[qscale][1];
             qmat = s->q_chroma_intra_matrix16[qscale][0];
         }
         /* note: block[0] is assumed to be positive */
@@ -109,7 +107,6 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         last_non_zero_p1 = 1;
     } else {
         last_non_zero_p1 = 0;
-        bias = s->q_inter_matrix16[qscale][1];
         qmat = s->q_inter_matrix16[qscale][0];
     }
 
@@ -121,7 +118,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "pxor  %%xmm4, %%xmm4               \n\t" // 0
             "movdqa  (%2), %%xmm5               \n\t" // qmat[0]
             "pxor  %%xmm6, %%xmm6               \n\t"
-            "psubw   (%3), %%xmm6               \n\t" // -bias[0]
+            "psubw 128(%2), %%xmm6              \n\t" // -bias[0]
             "mov $-128, %%"FF_REG_a"            \n\t"
             ".p2align 4                         \n\t"
             "1:                                 \n\t"
@@ -131,9 +128,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "pmulhw  %%xmm5, %%xmm0             \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
+            "movdqa  %%xmm0, (%4, %0)           \n\t"
             "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
-            "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+            "movdqa  (%3, %0), %%xmm1           \n\t"
             "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
@@ -143,7 +140,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "movd %%xmm3, %%"FF_REG_a"          \n\t"
             "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat), "r" (bias),
+            : "r" (block+64), "r" (qmat),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                 "%xmm4", "%xmm5", "%xmm6")
@@ -159,15 +156,15 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "1:                                 \n\t"
             "movdqa  (%1, %%"FF_REG_a"), %%xmm0 \n\t" // block[i]
             SAVE_SIGN("%%xmm1", "%%xmm0")             // ABS(block[i])
-            "movdqa  (%3, %%"FF_REG_a"), %%xmm6 \n\t" // bias[0]
+            "movdqa  128(%2, %0), %%xmm6        \n\t" // bias[i]
             "paddusw %%xmm6, %%xmm0             \n\t" // ABS(block[i]) + bias[0]
             "movdqa  (%2, %%"FF_REG_a"), %%xmm5 \n\t" // qmat[i]
             "pmulhw  %%xmm5, %%xmm0             \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
-            "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
+            "movdqa  %%xmm0, (%4, %0)           \n\t"
             "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
-            "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
+            "movdqa  (%3, %0), %%xmm1           \n\t"
             "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
@@ -177,7 +174,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "movd %%xmm3, %%"FF_REG_a"          \n\t"
             "movzbl %%al, %%eax                 \n\t" // last_non_zero_p1
             : "+a" (last_non_zero_p1)
-            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
+            : "r" (block+64), "r" (qmat+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                                 "%xmm4", "%xmm5", "%xmm6")

commit b890cd0f73750f0ca526a0b848f3daa48ae6eca5
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:56:23 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:13 2025 +0100

    avcodec/x86/mpegvideoenc_template: Avoid touching nonvolatile register
    
    xmm7 is nonvolatile on x64 Windows.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index 14e993de2b..b5417f6d32 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -117,7 +117,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
         __asm__ volatile(
             "movd %%"FF_REG_a", %%xmm3          \n\t" // last_non_zero_p1
             SPREADW("%%xmm3")
-            "pxor  %%xmm7, %%xmm7               \n\t" // 0
+            "pxor  %%xmm2, %%xmm2               \n\t" // 0
             "pxor  %%xmm4, %%xmm4               \n\t" // 0
             "movdqa  (%2), %%xmm5               \n\t" // qmat[0]
             "pxor  %%xmm6, %%xmm6               \n\t"
@@ -132,9 +132,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
             "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
-            "pcmpeqw %%xmm7, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
+            "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
             "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
-            "movdqa  %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+            "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
             "add        $16, %%"FF_REG_a"       \n\t"
@@ -146,13 +146,13 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             : "r" (block+64), "r" (qmat), "r" (bias),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                                "%xmm4", "%xmm5", "%xmm6")
         );
     }else{ // FMT_H263
         __asm__ volatile(
             "movd %%"FF_REG_a", %%xmm3          \n\t" // last_non_zero_p1
             SPREADW("%%xmm3")
-            "pxor %%xmm7, %%xmm7                \n\t" // 0
+            "pxor %%xmm2, %%xmm2                \n\t" // 0
             "pxor %%xmm4, %%xmm4                \n\t" // 0
             "mov $-128, %%"FF_REG_a"            \n\t"
             ".p2align 4                         \n\t"
@@ -166,9 +166,9 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             "por     %%xmm0, %%xmm4             \n\t"
             RESTORE_SIGN("%%xmm1", "%%xmm0")          // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
             "movdqa  %%xmm0, (%5, %%"FF_REG_a") \n\t"
-            "pcmpeqw %%xmm7, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
+            "pcmpeqw %%xmm2, %%xmm0             \n\t" // out==0 ? 0xFF : 0x00
             "movdqa  (%4, %%"FF_REG_a"), %%xmm1 \n\t"
-            "movdqa  %%xmm7, (%1, %%"FF_REG_a") \n\t" // 0
+            "movdqa  %%xmm2, (%1, %%"FF_REG_a") \n\t" // 0
             "pandn   %%xmm1, %%xmm0             \n\t"
             "pmaxsw  %%xmm0, %%xmm3             \n\t"
             "add        $16, %%"FF_REG_a"       \n\t"
@@ -180,7 +180,7 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
             : "r" (block+64), "r" (qmat+64), "r" (bias+64),
               "r" (inv_zigzag_direct16 + 64), "r" (temp_block + 64)
               XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                                "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                                "%xmm4", "%xmm5", "%xmm6")
         );
     }
     __asm__ volatile(

commit aeb138679a8f97f6c4716ccd91fac3adbe7bb4d1
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:44:02 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    avcodec/x86/mpegvideoencdsp: Port add_8x8basis_ssse3() to ASM
    
    Both GCC and Clang completely unroll the unlikely loop at -O3,
    leading to codesize bloat; their code is also suboptimal, as they
    don't make use of pmulhrsw (even with -mssse3). This commit
    therefore ports the whole function to external assembly. The new
    function occupies 176B here vs 1406B for GCC.
    
    Benchmarks for a testcase with huge qscale (notice that the C version
    is unrolled just like the unlikely loop in the SSSE3 version):
    add_8x8basis_c:                                         43.4 ( 1.00x)
    add_8x8basis_ssse3 (old):                               43.6 ( 1.00x)
    add_8x8basis_ssse3 (new):                               11.9 ( 3.63x)
    
    Reviewed-by: Kieran Kunhya <[email protected]>
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index 0e86a5304c..300f98b438 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -25,6 +25,58 @@
 
 SECTION .text
 
+; void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+INIT_XMM ssse3
+cglobal add_8x8basis, 3, 3+ARCH_X86_64, 4, rem, basis, scale
+    movd            m0, scaled
+    add         scaled, 1024
+    add         basisq, 128
+    add           remq, 128
+%if ARCH_X86_64
+%define OFF r3q
+    mov            r3q, -128
+    cmp         scaled, 2047
+%else
+%define OFF r2q
+    cmp         scaled, 2047
+    mov            r2q, -128
+%endif
+    ja     .huge_scale
+
+    punpcklwd       m0, m0
+    pshufd          m0, m0, 0x0
+    psllw           m0, 5
+.loop1:
+    mova            m1, [basisq+OFF]
+    mova            m2, [basisq+OFF+16]
+    pmulhrsw        m1, m0
+    pmulhrsw        m2, m0
+    paddw           m1, [remq+OFF]
+    paddw           m2, [remq+OFF+16]
+    mova    [remq+OFF], m1
+    mova [remq+OFF+16], m2
+    add            OFF, 32
+    js          .loop1
+    RET
+
+.huge_scale:
+    pslld           m0, 6
+    punpcklwd       m0, m0
+    pshufd          m1, m0, 0x55
+    psrlw           m0, 1
+    pshufd          m0, m0, 0x0
+.loop2:
+    mova            m2, [basisq+OFF]
+    pmulhrsw        m3, m2, m0
+    pmullw          m2, m1
+    paddw           m2, m3
+    paddw           m2, [remq+OFF]
+    mova    [remq+OFF], m2
+    add            OFF, 16
+    js          .loop2
+    RET
+
+
 INIT_XMM sse2
 cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
     pxor            m6, m6
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index f6169b5399..220c75785a 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -32,6 +32,7 @@ void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
 int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
+void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);
 
 #if HAVE_INLINE_ASM
 #if HAVE_SSSE3_INLINE
@@ -83,41 +84,6 @@ static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], c
     );
     return i;
 }
-
-static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
-{
-    x86_reg i=0;
-
-    if (FFABS(scale) < 1024) {
-        scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);
-        __asm__ volatile(
-                "movd                %3, %%xmm2     \n\t"
-                "punpcklwd       %%xmm2, %%xmm2     \n\t"
-                "pshufd      $0, %%xmm2, %%xmm2     \n\t"
-                ".p2align 4                         \n\t"
-                "1:                                 \n\t"
-                "movdqa        (%1, %0), %%xmm0     \n\t"
-                "movdqa      16(%1, %0), %%xmm1     \n\t"
-                "pmulhrsw        %%xmm2, %%xmm0     \n\t"
-                "pmulhrsw        %%xmm2, %%xmm1     \n\t"
-                "paddw         (%2, %0), %%xmm0     \n\t"
-                "paddw       16(%2, %0), %%xmm1     \n\t"
-                "movdqa          %%xmm0, (%2, %0)   \n\t"
-                "movdqa          %%xmm1, 16(%2, %0) \n\t"
-                "add                $32, %0         \n\t"
-                "cmp               $128, %0         \n\t" // FIXME optimize & bench
-                " jb                 1b             \n\t"
-                : "+r" (i)
-                : "r"(basis), "r"(rem), "g"(scale)
-                XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2")
-        );
-    } else {
-        for (i=0; i<8*8; i++) {
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
-
 #endif /* HAVE_SSSE3_INLINE */
 
 /* Draw the edges of width 'w' of an image of size width, height */
@@ -227,15 +193,17 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
             c->draw_edges = draw_edges_mmx;
         }
     }
+#endif /* HAVE_INLINE_ASM */
 
+    if (X86_SSSE3(cpu_flags)) {
 #if HAVE_SSSE3_INLINE
-    if (INLINE_SSSE3(cpu_flags)) {
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
             c->try_8x8basis = try_8x8basis_ssse3;
         }
-        c->add_8x8basis = add_8x8basis_ssse3;
-    }
 #endif /* HAVE_SSSE3_INLINE */
+#if HAVE_SSSE3_EXTERNAL
+        c->add_8x8basis = ff_add_8x8basis_ssse3;
+#endif
+    }
 
-#endif /* HAVE_INLINE_ASM */
 }

commit 0d3a88e55fc443640ed3c57c9fc906b1ed8a33b8
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 19:06:14 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    tests/checkasm/mpegvideoencdsp: Test denoise_dct
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/mpegvideoencdsp.c b/tests/checkasm/mpegvideoencdsp.c
index a4a4fa6f5c..955cd9f5b7 100644
--- a/tests/checkasm/mpegvideoencdsp.c
+++ b/tests/checkasm/mpegvideoencdsp.c
@@ -37,6 +37,37 @@
             buf[j] = rnd() % (max - min + 1) + min;      \
     } while (0)
 
+static void check_denoise_dct(MpegvideoEncDSPContext *c)
+{
+    declare_func(void, int16_t block[64], int dct_error_sum[64],
+                       const uint16_t dct_offset[64]);
+
+    if (check_func(c->denoise_dct, "denoise_dct")) {
+        DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+        DECLARE_ALIGNED(16, int16_t, block_new)[64];
+        DECLARE_ALIGNED(16, int, dct_error_sum_ref)[64];
+        DECLARE_ALIGNED(16, int, dct_error_sum_new)[64];
+        DECLARE_ALIGNED(16, uint16_t, dct_offset)[64];
+
+        for (size_t i = 0; i < FF_ARRAY_ELEMS(block_ref); ++i) {
+            unsigned random = rnd();
+            block_ref[i] = random & (1 << 16) ? random : 0;
+        }
+        randomize_buffers(dct_offset, sizeof(dct_offset));
+        randomize_buffer_clipped(dct_error_sum_ref, 0, (1 << 24) - 1);
+        memcpy(block_new, block_ref, sizeof(block_new));
+        memcpy(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_ref));
+
+        call_ref(block_ref, dct_error_sum_ref, dct_offset);
+        call_new(block_new, dct_error_sum_new, dct_offset);
+        if (memcmp(block_ref, block_new, sizeof(block_ref)) ||
+            memcmp(dct_error_sum_new, dct_error_sum_ref, sizeof(dct_error_sum_new)))
+            fail();
+
+        bench_new(block_new, dct_error_sum_new, dct_offset);
+    }
+}
+
 static void check_add_8x8basis(MpegvideoEncDSPContext *c)
 {
     declare_func(void, int16_t rem[64], const int16_t basis[64], int scale);
@@ -166,6 +197,8 @@ void checkasm_check_mpegvideoencdsp(void)
 
     ff_mpegvideoencdsp_init(&c, &avctx);
 
+    check_denoise_dct(&c);
+    report("denoise_dct");
     check_pix_sum(&c);
     report("pix_sum");
     check_pix_norm1(&c);

commit 1c00e094274b8571ea326311ff0425ba2dac0fd0
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 18:24:18 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    avcodec/mpegvideo_enc: Port denoise_dct to MpegvideoEncDSPContext
    
    It is very simple to remove the MPVEncContext from it.
    Notice that this also fixes a bug in x86/mpegvideoenc.c: It only
    used the SSE2 version of denoise_dct when dct_algo was auto or mmx
    (and it was therefore unused during FATE).
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/mips/Makefile b/libavcodec/mips/Makefile
index 4bbc2f00ea..1d777293d0 100644
--- a/libavcodec/mips/Makefile
+++ b/libavcodec/mips/Makefile
@@ -54,7 +54,6 @@ OBJS-$(CONFIG_BLOCKDSP)                   += mips/blockdsp_init_mips.o
 OBJS-$(CONFIG_PIXBLOCKDSP)                += mips/pixblockdsp_init_mips.o
 OBJS-$(CONFIG_IDCTDSP)                    += mips/idctdsp_init_mips.o
 OBJS-$(CONFIG_MPEGVIDEO)                  += mips/mpegvideo_init_mips.o
-OBJS-$(CONFIG_MPEGVIDEOENC)               += mips/mpegvideoenc_init_mips.o
 OBJS-$(CONFIG_MPEGVIDEOENCDSP)            += mips/mpegvideoencdsp_init_mips.o
 OBJS-$(CONFIG_ME_CMP)                     += mips/me_cmp_init_mips.o
 OBJS-$(CONFIG_MPEG4_DECODER)              += mips/xvididct_init_mips.o
@@ -100,7 +99,7 @@ MMI-OBJS-$(CONFIG_H264DSP)                += mips/h264dsp_mmi.o
 MMI-OBJS-$(CONFIG_H264CHROMA)             += mips/h264chroma_mmi.o
 MMI-OBJS-$(CONFIG_H264PRED)               += mips/h264pred_mmi.o
 MMI-OBJS-$(CONFIG_MPEGVIDEO)              += mips/mpegvideo_mmi.o
-MMI-OBJS-$(CONFIG_MPEGVIDEOENC)           += mips/mpegvideoenc_mmi.o
+MMI-OBJS-$(CONFIG_MPEGVIDEOENCDSP)        += mips/mpegvideoenc_mmi.o
 MMI-OBJS-$(CONFIG_IDCTDSP)                += mips/idctdsp_mmi.o           \
                                              mips/simple_idct_mmi.o
 MMI-OBJS-$(CONFIG_MPEG4_DECODER)          += mips/xvid_idct_mmi.o
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 72ffed6985..2a9ea4006e 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -22,7 +22,6 @@
 #define AVCODEC_MIPS_MPEGVIDEO_MIPS_H
 
 #include "libavcodec/mpegvideo.h"
-#include "libavcodec/mpegvideoenc.h"
 
 void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
@@ -34,6 +33,6 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
 void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
         int n, int qscale);
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block);
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]);
 
 #endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideoenc_init_mips.c b/libavcodec/mips/mpegvideoenc_init_mips.c
deleted file mode 100644
index 7831973eb8..0000000000
--- a/libavcodec/mips/mpegvideoenc_init_mips.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/*
- * Copyright (c) 2015 Manojkumar Bhosale ([email protected])
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include "libavutil/attributes.h"
-#include "libavutil/mips/cpu.h"
-#include "libavcodec/mpegvideoenc.h"
-#include "mpegvideo_mips.h"
-
-av_cold void ff_mpvenc_dct_init_mips(MPVEncContext *s)
-{
-    int cpu_flags = av_get_cpu_flags();
-
-    if (have_mmi(cpu_flags)) {
-        s->denoise_dct = ff_denoise_dct_mmi;
-    }
-}
diff --git a/libavcodec/mips/mpegvideoencdsp_init_mips.c b/libavcodec/mips/mpegvideoencdsp_init_mips.c
index 24a17b91db..df916282a2 100644
--- a/libavcodec/mips/mpegvideoencdsp_init_mips.c
+++ b/libavcodec/mips/mpegvideoencdsp_init_mips.c
@@ -23,12 +23,17 @@
 #include "libavcodec/bit_depth_template.c"
 #include "libavcodec/mpegvideoencdsp.h"
 #include "h263dsp_mips.h"
+#include "mpegvideo_mips.h"
 
 av_cold void ff_mpegvideoencdsp_init_mips(MpegvideoEncDSPContext *c,
                                           AVCodecContext *avctx)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_mmi(cpu_flags)) {
+        c->denoise_dct = ff_denoise_dct_mmi;
+    }
+
     if (have_msa(cpu_flags)) {
 #if BIT_DEPTH == 8
         c->pix_sum = ff_pix_sum_msa;
diff --git a/libavcodec/mips/mpegvideoenc_mmi.c b/libavcodec/mips/mpegvideoencdsp_mmi.c
similarity index 95%
rename from libavcodec/mips/mpegvideoenc_mmi.c
rename to libavcodec/mips/mpegvideoencdsp_mmi.c
index 085be3b0ec..2239a05978 100644
--- a/libavcodec/mips/mpegvideoenc_mmi.c
+++ b/libavcodec/mips/mpegvideoencdsp_mmi.c
@@ -25,17 +25,12 @@
 #include "mpegvideo_mips.h"
 #include "libavutil/mips/mmiutils.h"
 
-void ff_denoise_dct_mmi(MPVEncContext *s, int16_t *block)
+void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64])
 {
-    const int intra = s->c.mb_intra;
-    int *sum = s->dct_error_sum[intra];
-    uint16_t *offset = s->dct_offset[intra];
     double ftmp[8];
     mips_reg addr[1];
     DECLARE_VAR_ALL64;
 
-    s->dct_count[intra]++;
-
     __asm__ volatile(
         "pxor       %[ftmp0],   %[ftmp0],       %[ftmp0]                \n\t"
         "1:                                                             \n\t"
diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c
index ce0ee4bb68..9e83026b51 100644
--- a/libavcodec/mpegvideo_enc.c
+++ b/libavcodec/mpegvideo_enc.c
@@ -86,7 +86,6 @@
 static int encode_picture(MPVMainEncContext *const s, const AVPacket *pkt);
 static int dct_quantize_refine(MPVEncContext *const s, int16_t *block, int16_t *weight, int16_t *orig, int n, int qscale);
 static int sse_mb(MPVEncContext *const s);
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block);
 static int dct_quantize_c(MPVEncContext *const s,
                           int16_t *block, int n,
                           int qscale, int *overflow);
@@ -300,11 +299,8 @@ static av_cold void mpv_encode_defaults(MPVMainEncContext *const m)
 av_cold void ff_dct_encode_init(MPVEncContext *const s)
 {
     s->dct_quantize = dct_quantize_c;
-    s->denoise_dct  = denoise_dct_c;
 
-#if ARCH_MIPS
-    ff_mpvenc_dct_init_mips(s);
-#elif ARCH_X86
+#if ARCH_X86
     ff_dct_encode_init_x86(s);
 #endif
 
@@ -3955,29 +3951,14 @@ static int encode_picture(MPVMainEncContext *const m, const AVPacket *pkt)
     return 0;
 }
 
-static void denoise_dct_c(MPVEncContext *const s, int16_t *block)
+static inline void denoise_dct(MPVEncContext *const s, int16_t block[])
 {
-    const int intra = s->c.mb_intra;
-    int i;
+    if (!s->dct_error_sum)
+        return;
 
+    const int intra = s->c.mb_intra;
     s->dct_count[intra]++;
-
-    for(i=0; i<64; i++){
-        int level= block[i];
-
-        if(level){
-            if(level>0){
-                s->dct_error_sum[intra][i] += level;
-                level -= s->dct_offset[intra][i];
-                if(level<0) level=0;
-            }else{
-                s->dct_error_sum[intra][i] -= level;
-                level += s->dct_offset[intra][i];
-                if(level>0) level=0;
-            }
-            block[i]= level;
-        }
-    }
+    s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
 }
 
 static int dct_quantize_trellis_c(MPVEncContext *const s,
@@ -4009,8 +3990,8 @@ static int dct_quantize_trellis_c(MPVEncContext *const s,
 
     s->fdsp.fdct(block);
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    denoise_dct(s, block);
+
     qmul= qscale*16;
     qadd= ((qscale-1)|1)*8;
 
@@ -4678,8 +4659,7 @@ static int dct_quantize_c(MPVEncContext *const s,
 
     s->fdsp.fdct(block);
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    denoise_dct(s, block);
 
     if (s->c.mb_intra) {
         scantable = s->c.intra_scantable.scantable;
diff --git a/libavcodec/mpegvideoenc.h b/libavcodec/mpegvideoenc.h
index ee115c3611..131908c10a 100644
--- a/libavcodec/mpegvideoenc.h
+++ b/libavcodec/mpegvideoenc.h
@@ -123,7 +123,6 @@ typedef struct MPVEncContext {
     uint16_t (*q_inter_matrix16)[2][64];
 
     /* noise reduction */
-    void (*denoise_dct)(struct MPVEncContext *s, int16_t *block);
     int (*dct_error_sum)[64];
     int dct_count[2];
     uint16_t (*dct_offset)[64];
@@ -397,7 +396,6 @@ int ff_mpv_reallocate_putbitbuffer(MPVEncContext *s, size_t threshold, size_t si
 void ff_write_quant_matrix(PutBitContext *pb, uint16_t *matrix);
 
 void ff_dct_encode_init(MPVEncContext *s);
-void ff_mpvenc_dct_init_mips(MPVEncContext *s);
 void ff_dct_encode_init_x86(MPVEncContext *s);
 
 void ff_convert_matrix(MPVEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[2][64],
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index b4fd2af915..3b4a57d58a 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -28,6 +28,29 @@
 #include "mathops.h"
 #include "mpegvideoencdsp.h"
 
+static void denoise_dct_c(int16_t block[64], int dct_error_sum[64],
+                          const uint16_t dct_offset[64])
+{
+    for (int i = 0; i < 64; ++i) {
+        int level = block[i];
+
+        if (level) {
+            if (level > 0) {
+                dct_error_sum[i] += level;
+                level -= dct_offset[i];
+                if (level < 0)
+                    level = 0;
+            } else {
+                dct_error_sum[i] -= level;
+                level += dct_offset[i];
+                if (level > 0)
+                    level = 0;
+            }
+            block[i] = level;
+        }
+    }
+}
+
 static int try_8x8basis_c(const int16_t rem[64], const int16_t weight[64],
                           const int16_t basis[64], int scale)
 {
@@ -253,6 +276,8 @@ static void shrink88(uint8_t *dst, ptrdiff_t dst_wrap,
 av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                                      AVCodecContext *avctx)
 {
+    c->denoise_dct  = denoise_dct_c;
+
     c->try_8x8basis = try_8x8basis_c;
     c->add_8x8basis = add_8x8basis_c;
 
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 6ec665677b..989503f25f 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -30,6 +30,9 @@
 #define EDGE_BOTTOM 2
 
 typedef struct MpegvideoEncDSPContext {
+    void (*denoise_dct)(int16_t block[64], int dct_error_sum[64],
+                        const uint16_t dct_offset[64]);
+
     int (*try_8x8basis)(const int16_t rem[64], const int16_t weight[64],
                         const int16_t basis[64], int scale);
     void (*add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale);
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index e5665ac781..c667dcd2a2 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,22 +57,6 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 
 #endif /* HAVE_6REGS */
 
-#if HAVE_SSE2_EXTERNAL
-void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
-                             const uint16_t dct_offset[64]);
-
-static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
-{
-    const int intra = s->c.mb_intra;
-    int *sum= s->dct_error_sum[intra];
-    uint16_t *offset= s->dct_offset[intra];
-
-    s->dct_count[intra]++;
-
-    ff_mpv_denoise_dct_sse2(block, sum, offset);
-}
-#endif /* HAVE_SSE2_EXTERNAL */
-
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
     const int dct_algo = s->c.avctx->dct_algo;
@@ -83,9 +67,6 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
         if (INLINE_SSE2(cpu_flags)) {
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
-#endif
-#if HAVE_SSE2_EXTERNAL
-            s->denoise_dct  = denoise_dct_sse2;
 #endif
         }
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
diff --git a/libavcodec/x86/mpegvideoenc_template.c b/libavcodec/x86/mpegvideoenc_template.c
index f0b95c1621..14e993de2b 100644
--- a/libavcodec/x86/mpegvideoenc_template.c
+++ b/libavcodec/x86/mpegvideoenc_template.c
@@ -76,8 +76,11 @@ static int RENAME(dct_quantize)(MPVEncContext *const s,
     //s->fdct (block);
     ff_fdct_sse2(block); // cannot be anything else ...
 
-    if(s->dct_error_sum)
-        s->denoise_dct(s, block);
+    if (s->dct_error_sum) {
+        const int intra = s->c.mb_intra;
+        s->dct_count[intra]++;
+        s->mpvencdsp.denoise_dct(block, s->dct_error_sum[intra], s->dct_offset[intra]);
+    }
 
     if (s->c.mb_intra) {
         int dummy;
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index bf5b722016..f6169b5399 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -27,6 +27,8 @@
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
 
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+                             const uint16_t dct_offset[64]);
 int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
@@ -209,6 +211,7 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_SSE2(cpu_flags)) {
+        c->denoise_dct = ff_mpv_denoise_dct_sse2;
         c->pix_sum     = ff_pix_sum16_sse2;
         c->pix_norm1   = ff_pix_norm1_sse2;
     }

commit d633fa0433de093c9a1257aed519b806b1054f21
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 17:32:29 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    avcodec/x86/mpegvideoenc: Port denoise_dct_sse2 to external assembly
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index 2ca05f69ea..e5665ac781 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -57,8 +57,10 @@ DECLARE_ALIGNED(16, static const uint16_t, inv_zigzag_direct16)[64] = {
 
 #endif /* HAVE_6REGS */
 
-#if HAVE_INLINE_ASM
-#if HAVE_SSE2_INLINE
+#if HAVE_SSE2_EXTERNAL
+void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],
+                             const uint16_t dct_offset[64]);
+
 static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 {
     const int intra = s->c.mb_intra;
@@ -67,56 +69,9 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
 
     s->dct_count[intra]++;
 
-    __asm__ volatile(
-        "pxor %%xmm6, %%xmm6                    \n\t"
-        "1:                                     \n\t"
-        "pxor %%xmm0, %%xmm0                    \n\t"
-        "pxor %%xmm1, %%xmm1                    \n\t"
-        "movdqa (%0), %%xmm2                    \n\t"
-        "movdqa 16(%0), %%xmm3                  \n\t"
-        "pcmpgtw %%xmm2, %%xmm0                 \n\t"
-        "pcmpgtw %%xmm3, %%xmm1                 \n\t"
-        "pxor %%xmm0, %%xmm2                    \n\t"
-        "pxor %%xmm1, %%xmm3                    \n\t"
-        "psubw %%xmm0, %%xmm2                   \n\t"
-        "psubw %%xmm1, %%xmm3                   \n\t"
-        "movdqa %%xmm2, %%xmm4                  \n\t"
-        "movdqa %%xmm3, %%xmm5                  \n\t"
-        "psubusw (%2), %%xmm2                   \n\t"
-        "psubusw 16(%2), %%xmm3                 \n\t"
-        "pxor %%xmm0, %%xmm2                    \n\t"
-        "pxor %%xmm1, %%xmm3                    \n\t"
-        "psubw %%xmm0, %%xmm2                   \n\t"
-        "psubw %%xmm1, %%xmm3                   \n\t"
-        "movdqa %%xmm2, (%0)                    \n\t"
-        "movdqa %%xmm3, 16(%0)                  \n\t"
-        "movdqa %%xmm4, %%xmm2                  \n\t"
-        "movdqa %%xmm5, %%xmm0                  \n\t"
-        "punpcklwd %%xmm6, %%xmm4               \n\t"
-        "punpckhwd %%xmm6, %%xmm2               \n\t"
-        "punpcklwd %%xmm6, %%xmm5               \n\t"
-        "punpckhwd %%xmm6, %%xmm0               \n\t"
-        "paddd (%1), %%xmm4                     \n\t"
-        "paddd 16(%1), %%xmm2                   \n\t"
-        "paddd 32(%1), %%xmm5                   \n\t"
-        "paddd 48(%1), %%xmm0                   \n\t"
-        "movdqa %%xmm4, (%1)                    \n\t"
-        "movdqa %%xmm2, 16(%1)                  \n\t"
-        "movdqa %%xmm5, 32(%1)                  \n\t"
-        "movdqa %%xmm0, 48(%1)                  \n\t"
-        "add $32, %0                            \n\t"
-        "add $64, %1                            \n\t"
-        "add $32, %2                            \n\t"
-        "cmp %3, %0                             \n\t"
-            " jb 1b                             \n\t"
-        : "+r" (block), "+r" (sum), "+r" (offset)
-        : "r"(block+64)
-          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                            "%xmm4", "%xmm5", "%xmm6")
-    );
+    ff_mpv_denoise_dct_sse2(block, sum, offset);
 }
-#endif /* HAVE_SSE2_INLINE */
-#endif /* HAVE_INLINE_ASM */
+#endif /* HAVE_SSE2_EXTERNAL */
 
 av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 {
@@ -129,7 +84,9 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
 #endif
+#if HAVE_SSE2_EXTERNAL
             s->denoise_dct  = denoise_dct_sse2;
+#endif
         }
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
diff --git a/libavcodec/x86/mpegvideoencdsp.asm b/libavcodec/x86/mpegvideoencdsp.asm
index d12646ae54..0e86a5304c 100644
--- a/libavcodec/x86/mpegvideoencdsp.asm
+++ b/libavcodec/x86/mpegvideoencdsp.asm
@@ -24,6 +24,52 @@
 %include "libavutil/x86/x86util.asm"
 
 SECTION .text
+
+INIT_XMM sse2
+cglobal mpv_denoise_dct, 3, 4, 7, block, sum, offset
+    pxor            m6, m6
+    lea             r3, [sumq+256]
+.loop:
+    mova            m2, [blockq]
+    mova            m3, [blockq+16]
+    mova            m0, m6
+    mova            m1, m6
+    pcmpgtw         m0, m2
+    pcmpgtw         m1, m3
+    pxor            m2, m0
+    pxor            m3, m1
+    psubw           m2, m0
+    psubw           m3, m1
+    psubusw         m4, m2, [offsetq]
+    psubusw         m5, m3, [offsetq+16]
+    pxor            m4, m0
+    pxor            m5, m1
+    add        offsetq, 32
+    psubw           m4, m0
+    psubw           m5, m1
+    mova      [blockq], m4
+    mova   [blockq+16], m5
+    mova            m0, m2
+    mova            m1, m3
+    add         blockq, 32
+    punpcklwd       m0, m6
+    punpckhwd       m2, m6
+    punpcklwd       m1, m6
+    punpckhwd       m3, m6
+    paddd           m0, [sumq]
+    paddd           m2, [sumq+16]
+    paddd           m1, [sumq+32]
+    paddd           m3, [sumq+48]
+    mova        [sumq], m0
+    mova     [sumq+16], m2
+    mova     [sumq+32], m1
+    mova     [sumq+48], m3
+    add           sumq, 64
+    cmp           sumq, r3
+    jb           .loop
+    RET
+
+
 ; int ff_pix_sum16(const uint8_t *pix, ptrdiff_t line_size)
 ; %1 = number of loops
 ; %2 = number of GPRs used

commit 2cfef7031ca4620e4744534527fe1674963bfdda
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 16:46:18 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    avcodec/x86/mpegvideoenc: Reduce number of registers used
    
    Avoids a push+pop on x64 Windows.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index bb1d2cc319..2ca05f69ea 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -68,7 +68,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
     s->dct_count[intra]++;
 
     __asm__ volatile(
-        "pxor %%xmm7, %%xmm7                    \n\t"
+        "pxor %%xmm6, %%xmm6                    \n\t"
         "1:                                     \n\t"
         "pxor %%xmm0, %%xmm0                    \n\t"
         "pxor %%xmm1, %%xmm1                    \n\t"
@@ -90,18 +90,18 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
         "psubw %%xmm1, %%xmm3                   \n\t"
         "movdqa %%xmm2, (%0)                    \n\t"
         "movdqa %%xmm3, 16(%0)                  \n\t"
-        "movdqa %%xmm4, %%xmm6                  \n\t"
+        "movdqa %%xmm4, %%xmm2                  \n\t"
         "movdqa %%xmm5, %%xmm0                  \n\t"
-        "punpcklwd %%xmm7, %%xmm4               \n\t"
-        "punpckhwd %%xmm7, %%xmm6               \n\t"
-        "punpcklwd %%xmm7, %%xmm5               \n\t"
-        "punpckhwd %%xmm7, %%xmm0               \n\t"
+        "punpcklwd %%xmm6, %%xmm4               \n\t"
+        "punpckhwd %%xmm6, %%xmm2               \n\t"
+        "punpcklwd %%xmm6, %%xmm5               \n\t"
+        "punpckhwd %%xmm6, %%xmm0               \n\t"
         "paddd (%1), %%xmm4                     \n\t"
-        "paddd 16(%1), %%xmm6                   \n\t"
+        "paddd 16(%1), %%xmm2                   \n\t"
         "paddd 32(%1), %%xmm5                   \n\t"
         "paddd 48(%1), %%xmm0                   \n\t"
         "movdqa %%xmm4, (%1)                    \n\t"
-        "movdqa %%xmm6, 16(%1)                  \n\t"
+        "movdqa %%xmm2, 16(%1)                  \n\t"
         "movdqa %%xmm5, 32(%1)                  \n\t"
         "movdqa %%xmm0, 48(%1)                  \n\t"
         "add $32, %0                            \n\t"
@@ -112,7 +112,7 @@ static void denoise_dct_sse2(MPVEncContext *const s, int16_t block[])
         : "+r" (block), "+r" (sum), "+r" (offset)
         : "r"(block+64)
           XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
-                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
+                            "%xmm4", "%xmm5", "%xmm6")
     );
 }
 #endif /* HAVE_SSE2_INLINE */

commit 503afa40f7d6227ec25d42d40275f810940b0959
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 15 16:18:16 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Tue Nov 18 20:41:12 2025 +0100

    avcodec/x86/mpegvideoenc: Remove check for MMX
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index eac9947590..bb1d2cc319 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -123,16 +123,14 @@ av_cold void ff_dct_encode_init_x86(MPVEncContext *const s)
     const int dct_algo = s->c.avctx->dct_algo;
 
     if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
-        int cpu_flags = av_get_cpu_flags();
 #if HAVE_SSE2_INLINE
+        int cpu_flags = av_get_cpu_flags();
         if (INLINE_SSE2(cpu_flags)) {
 #if HAVE_6REGS
             s->dct_quantize = dct_quantize_sse2;
 #endif
             s->denoise_dct  = denoise_dct_sse2;
         }
-#endif
 #if HAVE_6REGS && HAVE_SSSE3_INLINE
         if (INLINE_SSSE3(cpu_flags))
             s->dct_quantize = dct_quantize_ssse3;

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/mips/Makefile                           |  3 +-
 libavcodec/mips/mpegvideo_mips.h                   |  3 +-
 libavcodec/mips/mpegvideoenc_init_mips.c           | 33 --------
 libavcodec/mips/mpegvideoencdsp_init_mips.c        |  5 ++
 .../{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c}  |  7 +-
 libavcodec/mpegvideo_enc.c                         | 38 ++-------
 libavcodec/mpegvideoenc.h                          |  2 -
 libavcodec/mpegvideoencdsp.c                       | 25 ++++++
 libavcodec/mpegvideoencdsp.h                       |  3 +
 libavcodec/x86/mpegvideoenc.c                      | 74 +---------------
 libavcodec/x86/mpegvideoenc_template.c             | 44 +++++-----
 libavcodec/x86/mpegvideoencdsp.asm                 | 98 ++++++++++++++++++++++
 libavcodec/x86/mpegvideoencdsp_init.c              | 49 +++--------
 libavutil/x86/asm.h                                |  5 +-
 tests/checkasm/mpegvideoencdsp.c                   | 33 ++++++++
 15 files changed, 212 insertions(+), 210 deletions(-)
 delete mode 100644 libavcodec/mips/mpegvideoenc_init_mips.c
 rename libavcodec/mips/{mpegvideoenc_mmi.c => mpegvideoencdsp_mmi.c} (95%)


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] branch master updated. 5bf57a925c avutil/x86/asm: Remove wrong comment, rename FF_REG_sp

Reply via email to