[FFmpeg-cvslog] [ffmpeg] branch master updated. c22c2c5e03 avcodec/mpegvideo: Port dct_unquantize_mpeg2_intra_mmx to SSE2

ffmpeg-git--- via ffmpeg-cvslog Wed, 03 Dec 2025 01:50:45 -0800

The branch, master has been updated
       via  c22c2c5e033e3f73df47d88e49df86206f298e46 (commit)
       via  6e2153111d5ff3b21a5303b7c23dd29de8a3bda6 (commit)
       via  60084b136916a4dcace41e75a3b873e77eebd648 (commit)
       via  1cb987d25bf4c8214461e12b01864b23c9bae67c (commit)
       via  a9a23925dfcf781dedc9cb910dd3097dd6224104 (commit)
       via  1fa8ffc1db2b62e475545bc6b117215704f9e1d8 (commit)
       via  6d56807a06ce06712c65f8fcbf2a9a444bf59353 (commit)
       via  0f7cc6aeeacba070d6d4b76a9f3a4d4036c3bb0b (commit)
       via  357fc5243c32300bba91c096488e86558beed4c8 (commit)
       via  581050a1755b335cb106ad1b6c8e5f6fa9c19bd0 (commit)
       via  e7a629049f7e9be397b0acabe75beb207ad9dc21 (commit)
       via  5d41d3e21dff14058b283491480a7382daeb5da9 (commit)
       via  011ef7fc65fcbf2141adcec9ca805874bb0a6a16 (commit)
       via  358c569b05bc6f9a107a5caebcc8da56e8bf9799 (commit)
      from  f7551e7505d389fcc14a8e16bcd13ab770658990 (commit)



- Log -----------------------------------------------------------------
commit c22c2c5e033e3f73df47d88e49df86206f298e46
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 06:48:19 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/mpegvideo: Port dct_unquantize_mpeg2_intra_mmx to SSE2
    
    Benefits from wider registers.
    
    Benchmarks:
    dct_unquantize_mpeg2_intra_c:                          228.2 ( 1.00x)
    dct_unquantize_mpeg2_intra_mmx:                         28.2 ( 8.10x)
    dct_unquantize_mpeg2_intra_sse2:                        18.4 (12.37x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 576f8f320f..7c137cf75e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -28,7 +28,7 @@
 #include "libavcodec/mpegvideodata.h"
 #include "libavcodec/mpegvideo_unquantize.h"
 
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
 
 #define SPLATW(reg) "punpcklwd    %%" #reg ", %%" #reg "\n\t" \
                     "pshufd   $0, %%" #reg ", %%" #reg "\n\t"
@@ -250,8 +250,8 @@ __asm__ volatile(
 
 #endif /* HAVE_SSSE3_INLINE */
 
-static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_sse2(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -271,35 +271,35 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
     quant_matrix = s->intra_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "movq %%mm0, %%mm2              \n\t"
-                "movq %%mm1, %%mm3              \n\t"
-                "psrlw $12, %%mm2               \n\t" // block[i] < 0 ? 0xf : 0
-                "psrlw $12, %%mm3               \n\t" // (block[i] is in the -2048..2047 range)
-                "pmullw %%mm4, %%mm0            \n\t" // block[i]*q
-                "pmullw %%mm5, %%mm1            \n\t" // block[i]*q
-                "paddw %%mm2, %%mm0             \n\t" // bias negative block[i]
-                "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
-                "psraw $4, %%mm0                \n\t" // is equivalent to divide
-                "psraw $4, %%mm1                \n\t" // with rounding towards zero
-                "movq %%mm0, (%1, %0)           \n\t"
-                "movq %%mm1, 8(%1, %0)          \n\t"
-
-                "add $16, %0                    \n\t"
-                "jng 1b                         \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "movdqa     %%xmm0, %%xmm2     \n\t"
+                "movdqa     %%xmm1, %%xmm3     \n\t"
+                "psrlw         $12, %%xmm2     \n\t" // block[i] < 0 ? 0xf : 0
+                "psrlw         $12, %%xmm3     \n\t" // (block[i] is in the -2048..2047 range)
+                "pmullw     %%xmm4, %%xmm0     \n\t" // block[i]*q
+                "pmullw     %%xmm5, %%xmm1     \n\t" // block[i]*q
+                "paddw      %%xmm2, %%xmm0     \n\t" // bias negative block[i]
+                "paddw      %%xmm3, %%xmm1     \n\t" // so that a right-shift
+                "psraw          $4, %%xmm0     \n\t" // is equivalent to divide
+                "psraw          $4, %%xmm1     \n\t" // with rounding towards zero
+                "movdqa     %%xmm0, (%1, %0)   \n\t"
+                "movdqa     %%xmm1, 16(%1, %0) \n\t"
+
+                "add           $32, %0         \n\t"
+                "jng 1b                        \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",)
+                  "memory"
         );
     block[0]= block0;
         //Note, we do not do mismatch control for intra as errors cannot accumulate
@@ -371,16 +371,16 @@ __asm__ volatile(
 }
 
 #endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 
 av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
 {
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags)) {
+    if (INLINE_SSE2(cpu_flags)) {
         if (!bitexact)
-            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
     }
 #if HAVE_SSSE3_INLINE
     if (INLINE_SSSE3(cpu_flags)) {
@@ -391,5 +391,5 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 }
diff --git a/tests/checkasm/mpegvideo_unquantize.c b/tests/checkasm/mpegvideo_unquantize.c
index 837606e60e..220a743a96 100644
--- a/tests/checkasm/mpegvideo_unquantize.c
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -215,7 +215,7 @@ void checkasm_check_mpegvideo_unquantize(void)
     int q_scale_type = rnd() & 1;
 
     ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
-    declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block, int n, int qscale);
+    declare_func(void, MPVContext *s, int16_t *block, int n, int qscale);
 
     for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
         void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =

commit 6e2153111d5ff3b21a5303b7c23dd29de8a3bda6
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 06:45:12 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Port dct_unquantize_mpeg2_inter_mmx to SSSE3
    
    Benefits from wider registers, pabsw and psignw.
    
    Benchmarks:
    dct_unquantize_mpeg2_inter_c:                          131.2 ( 1.00x)
    dct_unquantize_mpeg2_inter_mmx:                         50.2 ( 2.62x)
    dct_unquantize_mpeg2_inter_ssse3:                       20.5 ( 6.38x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 01048df47d..576f8f320f 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -305,8 +305,10 @@ __asm__ volatile(
         //Note, we do not do mismatch control for intra as errors cannot accumulate
 }
 
-static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_mpeg2_inter_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     av_assert2(s->block_last_index[n]>=0);
 
@@ -316,72 +318,59 @@ static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
 
 
 __asm__ volatile(
-                "movd          %k1, %%mm6      \n\t"
+                "movd          %k1, %%xmm6     \n\t"
                 "lea      (%2, %0), %1         \n\t"
                 "neg            %0             \n\t"
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlq $48, %%mm7               \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%1, %0), %%mm0      \n\t"
-                "movq    8(%1, %0), %%mm1      \n\t"
-                "movq     (%3, %0), %%mm4      \n\t"
-                "movq    8(%3, %0), %%mm5      \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
-                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
-                "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
-                "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%1, %0), %%mm4      \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5      \n\t" // block[i] == 0 ? -1 : 0
-                "psrlw $5, %%mm0                \n\t"
-                "psrlw $5, %%mm1                \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "pxor %%mm4, %%mm7              \n\t"
-                "pxor %%mm5, %%mm7              \n\t"
-                "movq        %%mm4, (%1, %0)   \n\t"
-                "movq        %%mm5, 8(%1, %0)  \n\t"
-
-                "add           $16, %0          \n\t"
-                "jng 1b                         \n\t"
-                "movd      124(%2), %%mm0      \n\t"
-                "movq %%mm7, %%mm6              \n\t"
-                "psrlq $32, %%mm7               \n\t"
-                "pxor %%mm6, %%mm7              \n\t"
-                "movq %%mm7, %%mm6              \n\t"
-                "psrlq $16, %%mm7               \n\t"
-                "pxor %%mm6, %%mm7              \n\t"
-                "pslld $31, %%mm7               \n\t"
-                "psrlq $15, %%mm7               \n\t"
-                "pxor %%mm7, %%mm0              \n\t"
-                "movd        %%mm0, 124(%2)    \n\t"
+                SPLATW(xmm6)
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrldq        $14, %%xmm7     \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%3, %0), %%xmm4     \n\t"
+                "movdqa 16(%3, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "paddw      %%xmm2, %%xmm2     \n\t" // abs(block[i])*2
+                "paddw      %%xmm3, %%xmm3     \n\t" // abs(block[i])*2
+                "pmullw     %%xmm4, %%xmm2     \n\t" // abs(block[i])*2*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // abs(block[i])*2*q
+                "paddw      %%xmm4, %%xmm2     \n\t" // (abs(block[i])*2 + 1)*q
+                "paddw      %%xmm5, %%xmm3     \n\t" // (abs(block[i])*2 + 1)*q
+                "psrlw          $5, %%xmm2     \n\t"
+                "psrlw          $5, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
+                "pxor       %%xmm2, %%xmm7     \n\t"
+                "pxor       %%xmm3, %%xmm7     \n\t"
+
+                "add           $32, %0         \n\t"
+                "jng 1b                        \n\t"
+                "movd      124(%2), %%xmm0     \n\t"
+                "movhlps    %%xmm7, %%xmm6     \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pshufd $1, %%xmm7, %%xmm6     \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pshuflw $1, %%xmm7, %%xmm6    \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pslld         $31, %%xmm7     \n\t"
+                "psrld         $15, %%xmm7     \n\t"
+                "pxor       %%xmm7, %%xmm0     \n\t"
+                "movd       %%xmm0, 124(%2)    \n\t"
 
                 : "+r"(offset), "+r" (qscale2)
                 : "r" (block), "r"(quant_matrix)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
 }
 
+#endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
@@ -392,7 +381,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     if (INLINE_MMX(cpu_flags)) {
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
-        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
 #if HAVE_SSSE3_INLINE
     if (INLINE_SSSE3(cpu_flags)) {
@@ -400,6 +388,7 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
         s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
+        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */

commit 60084b136916a4dcace41e75a3b873e77eebd648
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 3 19:45:49 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Port MPEG-1 unquantize functions to SSSE3
    
    Benefits from wider registers and pabsw, psignw.
    
    Benchmarks:
    dct_unquantize_mpeg1_inter_c:                          343.0 ( 1.00x)
    dct_unquantize_mpeg1_inter_mmx:                         50.6 ( 6.78x)
    dct_unquantize_mpeg1_inter_ssse3:                       17.2 (19.94x)
    dct_unquantize_mpeg1_intra_c:                          352.1 ( 1.00x)
    dct_unquantize_mpeg1_intra_mmx:                         48.8 ( 7.22x)
    dct_unquantize_mpeg1_intra_ssse3:                       19.5 (18.03x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 758bf57ab9..6aff5fbcd0 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -38,6 +38,8 @@
 #include "qpeldsp.h"
 #include "videodsp.h"
 
+#include "libavutil/mem_internal.h"
+
 #define MAX_THREADS 32
 
 /**
@@ -202,10 +204,10 @@ typedef struct MpegEncContext {
     int *mb_index2xy;        ///< mb_index -> mb_x + mb_y*mb_stride
 
     /** matrix transmitted in the bitstream */
-    uint16_t intra_matrix[64];
-    uint16_t chroma_intra_matrix[64];
-    uint16_t inter_matrix[64];
-    uint16_t chroma_inter_matrix[64];
+    DECLARE_ALIGNED(16, uint16_t, intra_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, chroma_intra_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, inter_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, chroma_inter_matrix)[64];
 
     /* error concealment / resync */
     int resync_mb_x;                 ///< x position of last resync marker
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 82a29d1bcf..01048df47d 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -138,10 +138,9 @@ __asm__ volatile(
                 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5",) "memory"
         );
 }
-#endif
 
-static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -159,59 +158,45 @@ static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
     quant_matrix = s->intra_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
-                "psraw $3, %%mm0                \n\t"
-                "psraw $3, %%mm1                \n\t"
-                "psubw %%mm7, %%mm0             \n\t"
-                "psubw %%mm7, %%mm1             \n\t"
-                "por %%mm7, %%mm0               \n\t"
-                "por %%mm7, %%mm1               \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%1, %0)           \n\t"
-                "movq %%mm5, 8(%1, %0)          \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrlw         $15, %%xmm7     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "pmullw     %%xmm4, %%xmm2     \n\t" // abs(block[i])*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // abs(block[i])*q
+                "psraw          $3, %%xmm2     \n\t"
+                "psraw          $3, %%xmm3     \n\t"
+                "psubw      %%xmm7, %%xmm2     \n\t"
+                "psubw      %%xmm7, %%xmm3     \n\t"
+                "por        %%xmm7, %%xmm2     \n\t"
+                "por        %%xmm7, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "add $16, %0                    \n\t"
-                "js 1b                          \n\t"
+                "add           $32, %0         \n\t"
+                "js 1b                         \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
     block[0]= block0;
 }
 
-static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -223,60 +208,48 @@ static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
         quant_matrix = s->inter_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
-                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
-                "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
-                "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
-                "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
-                "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
-                "psraw $4, %%mm0                \n\t"
-                "psraw $4, %%mm1                \n\t"
-                "psubw %%mm7, %%mm0             \n\t"
-                "psubw %%mm7, %%mm1             \n\t"
-                "por %%mm7, %%mm0               \n\t"
-                "por %%mm7, %%mm1               \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%1, %0)           \n\t"
-                "movq %%mm5, 8(%1, %0)          \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrlw         $15, %%xmm7     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "paddw      %%xmm2, %%xmm2     \n\t" // abs(block[i])*2
+                "paddw      %%xmm3, %%xmm3     \n\t" // abs(block[i])*2
+                "paddw      %%xmm7, %%xmm2     \n\t" // abs(block[i])*2 + 1
+                "paddw      %%xmm7, %%xmm3     \n\t" // abs(block[i])*2 + 1
+                "pmullw     %%xmm4, %%xmm2     \n\t" // (abs(block[i])*2 + 1)*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // (abs(block[i])*2 + 1)*q
+                "psraw          $4, %%xmm2     \n\t"
+                "psraw          $4, %%xmm3     \n\t"
+                "psubw      %%xmm7, %%xmm2     \n\t"
+                "psubw      %%xmm7, %%xmm3     \n\t"
+                "por        %%xmm7, %%xmm2     \n\t"
+                "por        %%xmm7, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "add $16, %0                    \n\t"
-                "js 1b                          \n\t"
+                "add           $32, %0         \n\t"
+                "js 1b                         \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
 }
 
+#endif /* HAVE_SSSE3_INLINE */
+
 static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
 {
@@ -417,8 +390,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
-        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
@@ -427,6 +398,8 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     if (INLINE_SSSE3(cpu_flags)) {
         s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
         s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
+        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
+        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */

commit 1cb987d25bf4c8214461e12b01864b23c9bae67c
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Nov 4 07:53:09 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Port dct_unquantize_h263_{intra,inter}_mmx to SSSE3
    
    It benefits from wider registers and psignw.
    
    Benchmarks:
    dct_unquantize_h263_inter_c:                            88.3 ( 1.00x)
    dct_unquantize_h263_inter_mmx:                          24.7 ( 3.58x)
    dct_unquantize_h263_inter_ssse3:                         9.3 ( 9.47x)
    dct_unquantize_h263_intra_c:                            93.7 ( 1.00x)
    dct_unquantize_h263_intra_mmx:                          30.6 ( 3.06x)
    dct_unquantize_h263_intra_ssse3:                        16.5 ( 5.69x)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index aa15e2b32a..82a29d1bcf 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,13 @@
 
 #if HAVE_MMX_INLINE
 
-static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
-                                          int16_t *block, int n, int qscale)
+#define SPLATW(reg) "punpcklwd    %%" #reg ", %%" #reg "\n\t" \
+                    "pshufd   $0, %%" #reg ", %%" #reg "\n\t"
+
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_h263_intra_ssse3(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     x86_reg qmul = (unsigned)qscale << 1;
     int level, qadd;
@@ -51,61 +56,45 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext 
*s,
     x86_reg offset = s->ac_pred ? 63 << 1 : 
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
 
 __asm__ volatile(
-                "movd          %k1, %%mm6       \n\t" //qmul
-                "lea      (%2, %0), %1          \n\t"
-                "neg            %0              \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "movd           %3, %%mm5       \n\t" //qadd
-                "pxor %%mm7, %%mm7              \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "psubw %%mm5, %%mm7             \n\t"
-                "pxor %%mm4, %%mm4              \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%1, %0), %%mm0       \n\t"
-                "movq    8(%1, %0), %%mm1       \n\t"
-
-                "pmullw %%mm6, %%mm0            \n\t"
-                "pmullw %%mm6, %%mm1            \n\t"
+                "movd          %k1, %%xmm0     \n\t" //qmul
+                "lea      (%2, %0), %1         \n\t"
+                "neg            %0             \n\t"
+                "movd           %3, %%xmm1     \n\t" //qadd
+                SPLATW(xmm0)
+                SPLATW(xmm1)
 
-                "movq     (%1, %0), %%mm2       \n\t"
-                "movq    8(%1, %0), %%mm3       \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%1, %0), %%xmm2     \n\t"
+                "movdqa 16(%1, %0), %%xmm3     \n\t"
 
-                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
+                "movdqa     %%xmm1, %%xmm4     \n\t"
+                "movdqa     %%xmm1, %%xmm5     \n\t"
 
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
+                "psignw     %%xmm2, %%xmm4     \n\t" // sgn(block[i])*qadd
+                "psignw     %%xmm3, %%xmm5     \n\t" // sgn(block[i])*qadd
 
-                "paddw %%mm7, %%mm0             \n\t"
-                "paddw %%mm7, %%mm1             \n\t"
+                "pmullw     %%xmm0, %%xmm2     \n\t"
+                "pmullw     %%xmm0, %%xmm3     \n\t"
 
-                "pxor %%mm0, %%mm2              \n\t"
-                "pxor %%mm1, %%mm3              \n\t"
+                "paddw      %%xmm4, %%xmm2     \n\t"
+                "paddw      %%xmm5, %%xmm3     \n\t"
 
-                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "pandn %%mm2, %%mm0             \n\t"
-                "pandn %%mm3, %%mm1             \n\t"
-
-                "movq        %%mm0, (%1, %0)    \n\t"
-                "movq        %%mm1, 8(%1, %0)   \n\t"
-
-                "add           $16, %0          \n\t"
-                "jng 1b                         \n\t"
+                "add           $32, %0         \n\t"
+                "jng            1b             \n\t"
                 : "+r"(offset), "+r"(qmul)
                 : "r" (block), "rm" (qadd)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", 
"%xmm5",) "memory"
         );
         block[0]= level;
 }
 
 
-static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
-                                          int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_ssse3(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     int qmul = qscale << 1;
     int qadd = (qscale - 1) | 1;
@@ -115,56 +104,41 @@ static void dct_unquantize_h263_inter_mmx(const 
MPVContext *s,
     x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 
1;
 
 __asm__ volatile(
-                "movd           %2, %%mm6       \n\t" //qmul
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "movd           %3, %%mm5       \n\t" //qadd
-                "add            %1, %0          \n\t"
-                "neg            %1              \n\t"
-                "pxor %%mm7, %%mm7              \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "psubw %%mm5, %%mm7             \n\t"
-                "pxor %%mm4, %%mm4              \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%0, %1), %%mm0       \n\t"
-                "movq    8(%0, %1), %%mm1       \n\t"
+                "movd           %2, %%xmm0     \n\t" //qmul
+                "movd           %3, %%xmm1     \n\t" //qadd
+                "add            %1, %0         \n\t"
+                "neg            %1             \n\t"
+                SPLATW(xmm0)
+                SPLATW(xmm1)
 
-                "pmullw %%mm6, %%mm0            \n\t"
-                "pmullw %%mm6, %%mm1            \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%0, %1), %%xmm2     \n\t"
+                "movdqa 16(%0, %1), %%xmm3     \n\t"
 
-                "movq     (%0, %1), %%mm2       \n\t"
-                "movq    8(%0, %1), %%mm3       \n\t"
+                "movdqa     %%xmm1, %%xmm4     \n\t"
+                "movdqa     %%xmm1, %%xmm5     \n\t"
 
-                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
+                "psignw     %%xmm2, %%xmm4     \n\t" // sgn(block[i])*qadd
+                "psignw     %%xmm3, %%xmm5     \n\t" // sgn(block[i])*qadd
 
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-
-                "paddw %%mm7, %%mm0             \n\t"
-                "paddw %%mm7, %%mm1             \n\t"
-
-                "pxor %%mm0, %%mm2              \n\t"
-                "pxor %%mm1, %%mm3              \n\t"
-
-                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
+                "pmullw     %%xmm0, %%xmm2     \n\t"
+                "pmullw     %%xmm0, %%xmm3     \n\t"
 
-                "pandn %%mm2, %%mm0             \n\t"
-                "pandn %%mm3, %%mm1             \n\t"
+                "paddw      %%xmm4, %%xmm2     \n\t"
+                "paddw      %%xmm5, %%xmm3     \n\t"
 
-                "movq        %%mm0, (%0, %1)    \n\t"
-                "movq        %%mm1, 8(%0, %1)   \n\t"
+                "movdqa     %%xmm2, (%0, %1)   \n\t"
+                "movdqa     %%xmm3, 16(%0, %1) \n\t"
 
-                "add           $16, %1          \n\t"
-                "jng 1b                         \n\t"
+                "add           $32, %1         \n\t"
+                "jng 1b                        \n\t"
                 : "+r" (block), "+r" (offset)
                 : "rm"(qmul), "rm" (qadd)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", 
"%xmm5",) "memory"
         );
 }
+#endif
 
 static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
@@ -443,13 +417,17 @@ av_cold void 
ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
-        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
+#if HAVE_SSSE3_INLINE
+    if (INLINE_SSSE3(cpu_flags)) {
+        s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
+        s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
+    }
+#endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
 }

commit a9a23925dfcf781dedc9cb910dd3097dd6224104
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Nov 3 19:17:16 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Don't duplicate register
    
    Currently several inline ASM blocks use a value as
    an input and rax as a clobber register. The input value
    is just moved into the register, which then serves as the loop
    counter. This is wasteful, as one can just use the value's
    register directly as the loop counter.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index d1614eb1eb..aa15e2b32a 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -183,19 +183,19 @@ static void dct_unquantize_mpeg1_intra_mmx(const 
MPVContext *s,
         block0 = block[0] * s->c_dc_scale;
     /* XXX: only MPEG-1 */
     quant_matrix = s->intra_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlw $15, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -210,8 +210,8 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
                 "psraw $3, %%mm0                \n\t"
                 "psraw $3, %%mm1                \n\t"
                 "psubw %%mm7, %%mm0             \n\t"
@@ -224,13 +224,14 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%1, %0)           \n\t"
+                "movq %%mm5, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "js 1b                          \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
     block[0]= block0;
 }
@@ -246,19 +247,19 @@ static void dct_unquantize_mpeg1_inter_mmx(const 
MPVContext *s,
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
         quant_matrix = s->inter_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlw $15, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -277,8 +278,8 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 
1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
                 "psraw $4, %%mm0                \n\t"
                 "psraw $4, %%mm1                \n\t"
                 "psubw %%mm7, %%mm0             \n\t"
@@ -291,13 +292,14 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%1, %0)           \n\t"
+                "movq %%mm5, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "js 1b                          \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
 }
 
@@ -320,17 +322,17 @@ static void dct_unquantize_mpeg2_intra_mmx(const 
MPVContext *s,
     else
         block0 = block[0] * s->c_dc_scale;
     quant_matrix = s->intra_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "movq %%mm0, %%mm2              \n\t"
@@ -343,13 +345,14 @@ __asm__ volatile(
                 "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
                 "psraw $4, %%mm0                \n\t" // is equivalent to 
divide
                 "psraw $4, %%mm1                \n\t" // with rounding towards 
zero
-                "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm0, (%1, %0)           \n\t"
+                "movq %%mm1, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
     block[0]= block0;
         //Note, we do not do mismatch control for intra as errors cannot 
accumulate
@@ -358,30 +361,27 @@ __asm__ volatile(
 static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
 {
-    x86_reg nCoeffs;
-    const uint16_t *quant_matrix;
-
     av_assert2(s->block_last_index[n]>=0);
 
-    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
-    else                 qscale <<= 1;
+    x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : 
(unsigned)qscale << 1;
+    x86_reg offset  = s->intra_scantable.raster_end[s->block_last_index[n]] << 
1;
+    const void *quant_matrix = (const char*)s->inter_matrix + offset;
 
-    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
-        quant_matrix = s->inter_matrix;
 __asm__ volatile(
+                "movd          %k1, %%mm6      \n\t"
+                "lea      (%2, %0), %1         \n\t"
+                "neg            %0             \n\t"
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlq $48, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq     (%1, %0), %%mm0      \n\t"
+                "movq    8(%1, %0), %%mm1      \n\t"
+                "movq     (%3, %0), %%mm4      \n\t"
+                "movq    8(%3, %0), %%mm5      \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -400,8 +400,8 @@ __asm__ volatile(
                 "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 
1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw  (%1, %0), %%mm4      \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5      \n\t" // block[i] == 0 ? -1 : 0
                 "psrlw $5, %%mm0                \n\t"
                 "psrlw $5, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
@@ -412,12 +412,12 @@ __asm__ volatile(
                 "pandn %%mm1, %%mm5             \n\t"
                 "pxor %%mm4, %%mm7              \n\t"
                 "pxor %%mm5, %%mm7              \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq        %%mm4, (%1, %0)   \n\t"
+                "movq        %%mm5, 8(%1, %0)  \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add           $16, %0          \n\t"
                 "jng 1b                         \n\t"
-                "movd 124(%0, %3), %%mm0        \n\t"
+                "movd      124(%2), %%mm0      \n\t"
                 "movq %%mm7, %%mm6              \n\t"
                 "psrlq $32, %%mm7               \n\t"
                 "pxor %%mm6, %%mm7              \n\t"
@@ -427,10 +427,11 @@ __asm__ volatile(
                 "pslld $31, %%mm7               \n\t"
                 "psrlq $15, %%mm7               \n\t"
                 "pxor %%mm7, %%mm0              \n\t"
-                "movd %%mm0, 124(%0, %3)        \n\t"
+                "movd        %%mm0, 124(%2)    \n\t"
 
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "r" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r"(offset), "+r" (qscale2)
+                : "r" (block), "r"(quant_matrix)
+                : "memory"
         );
 }
 

commit 1fa8ffc1db2b62e475545bc6b117215704f9e1d8
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Oct 7 10:35:08 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Improve unquantizing MPEG-2 intra blocks
    
    Unquantizing involves calculating
        (block[j] * qscale * quant_matrix[j]) / 16
    where / rounds towards zero. Arithmetic right shifts
    naturally round towards -inf, so the earlier code
    calculated the absolute value first, then used a right-shift
    and then negated the result if necessary.
    
    This commit uses a different procedure: It biases the product
    for negative values of block[j] by 0xf. The combination of
    this and the arithmetic right shift is the same as rounding
    towards zero.
    
    Furthermore, a write-only store to mm7 has been removed.
    
    Benchmarks:
    dct_unquantize_mpeg2_intra_c:                          214.3 ( 1.00x)
    dct_unquantize_mpeg2_intra_mmx (old):                   43.0 ( 4.98x)
    dct_unquantize_mpeg2_intra_mmx (new):                   28.4 ( 7.56x)
    
    (The bitexact flag and the test for correctness have been removed
    from checkasm for the benchmarks.)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 38dcd8fc6e..d1614eb1eb 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -321,8 +321,6 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext 
*s,
         block0 = block[0] * s->c_dc_scale;
     quant_matrix = s->intra_matrix;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
                 "movd %2, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
@@ -335,30 +333,18 @@ __asm__ volatile(
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
-                "psraw $4, %%mm0                \n\t"
-                "psraw $4, %%mm1                \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm0, %%mm2              \n\t"
+                "movq %%mm1, %%mm3              \n\t"
+                "psrlw $12, %%mm2               \n\t" // block[i] < 0 ? 0xf : 0
+                "psrlw $12, %%mm3               \n\t" // (block[i] is in the 
-2048..2047 range)
+                "pmullw %%mm4, %%mm0            \n\t" // block[i]*q
+                "pmullw %%mm5, %%mm1            \n\t" // block[i]*q
+                "paddw %%mm2, %%mm0             \n\t" // bias negative block[i]
+                "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
+                "psraw $4, %%mm0                \n\t" // is equivalent to 
divide
+                "psraw $4, %%mm1                \n\t" // with rounding towards 
zero
+                "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
+                "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
                 "jng 1b                         \n\t"

commit 6d56807a06ce06712c65f8fcbf2a9a444bf59353
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 22:23:50 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/x86/mpegvideo: Use correct inline assembly constraints
    
    The H.263 unquantize functions modified an input parameter.
    (And they did so since this code was added in
    7f3f5ec87bcbf244fce49ffdb476d4ae6e523af6. I am surprised
    that this didn't cause issues, particularly with the intra function.)
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 4c3299362e..38dcd8fc6e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -33,9 +33,8 @@
 static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
                                           int16_t *block, int n, int qscale)
 {
-    x86_reg level, qmul, qadd, nCoeffs;
-
-    qmul = qscale << 1;
+    x86_reg qmul = (unsigned)qscale << 1;
+    int level, qadd;
 
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
@@ -49,16 +48,15 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext 
*s,
         qadd = 0;
         level= block[0];
     }
-    if(s->ac_pred)
-        nCoeffs=63;
-    else
-        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+    x86_reg offset = s->ac_pred ? 63 << 1 : 
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
 
 __asm__ volatile(
-                "movd %1, %%mm6                 \n\t" //qmul
+                "movd          %k1, %%mm6       \n\t" //qmul
+                "lea      (%2, %0), %1          \n\t"
+                "neg            %0              \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "movd %2, %%mm5                 \n\t" //qadd
+                "movd           %3, %%mm5       \n\t" //qadd
                 "pxor %%mm7, %%mm7              \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
@@ -66,14 +64,14 @@ __asm__ volatile(
                 "pxor %%mm4, %%mm4              \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %3), %%mm0           \n\t"
-                "movq 8(%0, %3), %%mm1          \n\t"
+                "movq     (%1, %0), %%mm0       \n\t"
+                "movq    8(%1, %0), %%mm1       \n\t"
 
                 "pmullw %%mm6, %%mm0            \n\t"
                 "pmullw %%mm6, %%mm1            \n\t"
 
-                "movq (%0, %3), %%mm2           \n\t"
-                "movq 8(%0, %3), %%mm3          \n\t"
+                "movq     (%1, %0), %%mm2       \n\t"
+                "movq    8(%1, %0), %%mm3       \n\t"
 
                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
@@ -93,12 +91,13 @@ __asm__ volatile(
                 "pandn %%mm2, %%mm0             \n\t"
                 "pandn %%mm3, %%mm1             \n\t"
 
-                "movq %%mm0, (%0, %3)           \n\t"
-                "movq %%mm1, 8(%0, %3)          \n\t"
+                "movq        %%mm0, (%1, %0)    \n\t"
+                "movq        %%mm1, 8(%1, %0)   \n\t"
 
-                "add $16, %3                    \n\t"
+                "add           $16, %0          \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" 
(2*(-nCoeffs))
+                : "+r"(offset), "+r"(qmul)
+                : "r" (block), "rm" (qadd)
                 : "memory"
         );
         block[0]= level;
@@ -108,20 +107,20 @@ __asm__ volatile(
 static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
                                           int16_t *block, int n, int qscale)
 {
-    x86_reg qmul, qadd, nCoeffs;
-
-    qmul = qscale << 1;
-    qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    int qadd = (qscale - 1) | 1;
 
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
-    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+    x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 
1;
 
 __asm__ volatile(
-                "movd %1, %%mm6                 \n\t" //qmul
+                "movd           %2, %%mm6       \n\t" //qmul
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "movd %2, %%mm5                 \n\t" //qadd
+                "movd           %3, %%mm5       \n\t" //qadd
+                "add            %1, %0          \n\t"
+                "neg            %1              \n\t"
                 "pxor %%mm7, %%mm7              \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
@@ -129,14 +128,14 @@ __asm__ volatile(
                 "pxor %%mm4, %%mm4              \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %3), %%mm0           \n\t"
-                "movq 8(%0, %3), %%mm1          \n\t"
+                "movq     (%0, %1), %%mm0       \n\t"
+                "movq    8(%0, %1), %%mm1       \n\t"
 
                 "pmullw %%mm6, %%mm0            \n\t"
                 "pmullw %%mm6, %%mm1            \n\t"
 
-                "movq (%0, %3), %%mm2           \n\t"
-                "movq 8(%0, %3), %%mm3          \n\t"
+                "movq     (%0, %1), %%mm2       \n\t"
+                "movq    8(%0, %1), %%mm3       \n\t"
 
                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
@@ -156,12 +155,13 @@ __asm__ volatile(
                 "pandn %%mm2, %%mm0             \n\t"
                 "pandn %%mm3, %%mm1             \n\t"
 
-                "movq %%mm0, (%0, %3)           \n\t"
-                "movq %%mm1, 8(%0, %3)          \n\t"
+                "movq        %%mm0, (%0, %1)    \n\t"
+                "movq        %%mm1, 8(%0, %1)   \n\t"
 
-                "add $16, %3                    \n\t"
+                "add           $16, %1          \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" 
(2*(-nCoeffs))
+                : "+r" (block), "+r" (offset)
+                : "rm"(qmul), "rm" (qadd)
                 : "memory"
         );
 }

commit 0f7cc6aeeacba070d6d4b76a9f3a4d4036c3bb0b
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 01:17:08 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:43 2025 +0100

    avcodec/mpegvideo: Move ff_init_scantable() to mpegvideo_unquantize.c
    
    This is necessary so that the mpegvideo_unquantize checkasm test
    does not pull mpegvideo.o and then all of libavcodec into checkasm.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index a137fe31db..7ca2c8f701 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -42,7 +42,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "mpegvideodata.h"
-#include "mpegvideo_unquantize.h"
 #include "libavutil/refstruct.h"
 
 
@@ -79,20 +78,6 @@ static av_cold void dsp_init(MpegEncContext *s)
     }
 }
 
-av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
-                               const uint8_t *src_scantable)
-{
-    st->scantable = src_scantable;
-
-    for (int i = 0, end = -1; i < 64; i++) {
-        int j = src_scantable[i];
-        st->permutated[i] = permutation[j];
-        if (permutation[j] > end)
-            end = permutation[j];
-        st->raster_end[i] = end;
-    }
-}
-
 av_cold void ff_mpv_idct_init(MpegEncContext *s)
 {
     if (s->codec_id == AV_CODEC_ID_MPEG4)
diff --git a/libavcodec/mpegvideo_unquantize.c 
b/libavcodec/mpegvideo_unquantize.c
index 06c29d0753..9297c80b47 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,6 +33,20 @@
 #include "mpegvideodata.h"
 #include "mpegvideo_unquantize.h"
 
+av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
+                               const uint8_t *src_scantable)
+{
+    st->scantable = src_scantable;
+
+    for (int i = 0, end = -1; i < 64; i++) {
+        int j = src_scantable[i];
+        st->permutated[i] = permutation[j];
+        if (permutation[j] > end)
+            end = permutation[j];
+        st->raster_end[i] = end;
+    }
+}
+
 static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
                                          int16_t *block, int n, int qscale)
 {

commit 357fc5243c32300bba91c096488e86558beed4c8
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sat Nov 29 01:05:51 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:39 2025 +0100

    avcodec/{arm,neon}/mpegvideo: Fix h263 unquantize functions
    
    These functions currently operate on the assumption that the number
    of coefficients to process is always of the form 16k+m with m<=4 or >8.
    Yet this is not true when the IDCT permutation is of type 
FF_IDCT_PERM_LIBMPEG2
    (i.e. when FF_IDCT_INT is in use).
    
    Reviewed-by: Martin Storsjö <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index c7a35ea267..7e42bdf6c5 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -36,7 +36,7 @@ function ff_dct_unquantize_h263_neon, export=1
         vdup.16         q15, r0                 @ qmul
         vdup.16         q14, r2                 @ qadd
         vneg.s16        q13, q14
-        cmp             r3,  #4
+        cmp             r3,  #8
         mov             r0,  r1
         ble             2f
 1:
@@ -62,14 +62,14 @@ function ff_dct_unquantize_h263_neon, export=1
         cmp             r3,  #8
         bgt             1b
 2:
-        vld1.16         {d0},     [r0,:64]
-        vclt.s16        d3,  d0,  #0
-        vceq.s16        d1,  d0,  #0
-        vmul.s16        d2,  d0,  d30
-        vbsl            d3,  d26, d28
-        vadd.s16        d2,  d2,  d3
-        vbif            d0,  d2,  d1
-        vst1.16         {d0},     [r1,:64]
+        vld1.16         {q0},     [r0,:128]
+        vclt.s16        q3,  q0,  #0
+        vceq.s16        q1,  q0,  #0
+        vmul.s16        q2,  q0,  q15
+        vbsl            q3,  q13, q14
+        vadd.s16        q2,  q2,  q3
+        vbif            q0,  q2,  q1
+        vst1.16         {q0},     [r1,:128]
         bx              lr
 endfunc
 
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index 3427dbe427..44e9b70303 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -39,12 +39,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, 
int qadd, int nCoeffs
 {
     int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
     int16x8_t q14s16, q15s16, qzs16;
-    int16x4_t d0s16, d2s16, d3s16, dzs16;
     uint16x8_t q1u16, q9u16;
-    uint16x4_t d1u16;
-
-    dzs16 = vdup_n_s16(0);
-    qzs16 = vdupq_n_s16(0);
 
     q15s16 = vdupq_n_s16(qscale << 1);
     q14s16 = vdupq_n_s16(qadd);
@@ -73,15 +68,14 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, 
int qadd, int nCoeffs
     if (nCoeffs <= 0)
         return;
 
-    d0s16 = vld1_s16(block);
-    d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
-    d1u16 = vceq_s16(d0s16, dzs16);
-    d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
-    d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
-                     vget_high_s16(q13s16), vget_high_s16(q14s16));
-    d2s16 = vadd_s16(d2s16, d3s16);
-    d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
-    vst1_s16(block, d0s16);
+    q0s16 = vld1q_s16(block);
+    q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
+    q1u16 = vceqq_s16(q0s16, qzs16);
+    q2s16 = vmulq_s16(q0s16, q15s16);
+    q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
+    q2s16 = vaddq_s16(q2s16, q3s16);
+    q0s16 = vbslq_s16(q1u16, q0s16, q2s16);
+    vst1q_s16(block, q0s16);
 }
 
 static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,

commit 581050a1755b335cb106ad1b6c8e5f6fa9c19bd0
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 28 22:25:39 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:23:39 2025 +0100

    tests/checkasm: Add mpegvideo unquantize test
    
    This adds a test for the mpegvideo unquantize functions.
    
    It has been written in order to be able to easily bench
    these functions. It should be noted that the random input
    fed to the tested functions is not necessarily representative
    of the stuff actually occurring in the wild. So benchmarks should
    be taken with a grain of salt; but comparisons between two functions
    that do not depend on branch predictions are valid (the usecase
    for this is to port the x86 mmx functions to use xmm registers).
    
    During testing I have found a bug in the arm/aarch64 neon optimizations
    when using the LIBMPEG2 permutation (used by FF_IDCT_INT): The code
    seems to be based on the presumption that the remainder of the number
    of coefficients to process is always <= 4 mod 16. The test therefore
    sometimes fails for these arches.
    
    Hint: I am not certain that 16 bits are enough for the intermediate
    values of all the computations involved; e.g. both FLV and MPEG-4
    escape values can go beyond that after the corresponding
    multiplications. The input in this test is nevertheless designed
    to fit into 16 bits.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3762c0d83b..b9c8adb21f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -19,6 +19,7 @@ AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
 AVCODECOBJS-$(CONFIG_LPC)               += lpc.o
 AVCODECOBJS-$(CONFIG_ME_CMP)            += motion.o
+AVCODECOBJS-$(CONFIG_MPEGVIDEO)         += mpegvideo_unquantize.o
 AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP)   += mpegvideoencdsp.o
 AVCODECOBJS-$(CONFIG_QPELDSP)           += qpeldsp.o
 AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8c64684fa3..a899967937 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -215,6 +215,9 @@ static const struct {
     #if CONFIG_ME_CMP
         { "motion", checkasm_check_motion },
     #endif
+    #if CONFIG_MPEGVIDEO
+        { "mpegvideo_unquantize", checkasm_check_mpegvideo_unquantize },
+    #endif
     #if CONFIG_MPEGVIDEOENCDSP
         { "mpegvideoencdsp", checkasm_check_mpegvideoencdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 05f74ca16b..ec075c4763 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
 void checkasm_check_lpc(void);
 void checkasm_check_motion(void);
+void checkasm_check_mpegvideo_unquantize(void);
 void checkasm_check_mpegvideoencdsp(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
diff --git a/tests/checkasm/mpegvideo_unquantize.c 
b/tests/checkasm/mpegvideo_unquantize.c
new file mode 100644
index 0000000000..837606e60e
--- /dev/null
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -0,0 +1,273 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "config.h"
+
+#include "checkasm.h"
+
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/mathops.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
+#include "libavcodec/mpegvideo_unquantize.h"
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+/* Fill *s with rnd() data; the bound is parenthesized since '<' binds tighter than '&'. */
+#define randomize_struct(TYPE, s) do {                                \
+    static_assert(!(_Alignof(TYPE) % 4), "can't use aligned stores"); \
+    unsigned char *ptr = (unsigned char*)s;                           \
+    for (size_t i = 0; i < (sizeof(*s) & ~3); i += 4)                 \
+        AV_WN32A(ptr + i, rnd());                                     \
+    for (size_t i = sizeof(*s) & ~3; i < sizeof(*s); ++i)             \
+        ptr[i] = rnd();                                               \
+   } while (0)
+
+enum TestType {
+    H263,
+    MPEG1,
+    MPEG2,
+};
+
+static void init_idct_scantable(MPVContext *const s, int intra_scantable)
+{
+    static const enum idct_permutation_type permutation_types[] = {
+        FF_IDCT_PERM_NONE,
+        FF_IDCT_PERM_LIBMPEG2,
+#if ARCH_X86_32 && HAVE_X86ASM
+        FF_IDCT_PERM_SIMPLE,
+#endif
+#if ARCH_PPC || ARCH_X86
+        FF_IDCT_PERM_TRANSPOSE,
+#endif
+#if ARCH_ARM || ARCH_AARCH64
+        FF_IDCT_PERM_PARTTRANS,
+#endif
+#if ARCH_X86 && HAVE_X86ASM
+        FF_IDCT_PERM_SSE2,
+#endif
+    };
+    // Copied here to avoid #ifs.
+    static const uint8_t ff_wmv1_scantable[][64] = {
+    { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+      0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+      0x30, 0x38, 0x29, 0x21, 0x1A, 0x13, 0x0C, 0x05,
+      0x06, 0x0D, 0x14, 0x1B, 0x22, 0x31, 0x39, 0x3A,
+      0x32, 0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F,
+      0x16, 0x1D, 0x24, 0x2B, 0x33, 0x3B, 0x3C, 0x34,
+      0x2C, 0x25, 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x35,
+      0x3D, 0x3E, 0x36, 0x2E, 0x27, 0x2F, 0x37, 0x3F, },
+    { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+      0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+      0x21, 0x30, 0x1A, 0x13, 0x0C, 0x05, 0x06, 0x0D,
+      0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39, 0x2A,
+      0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F, 0x16, 0x1D,
+      0x24, 0x2B, 0x32, 0x3A, 0x33, 0x3B, 0x2C, 0x25,
+      0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3C, 0x35,
+      0x3D, 0x2E, 0x27, 0x2F, 0x36, 0x3E, 0x37, 0x3F, },
+    { 0x00, 0x01, 0x08, 0x02, 0x03, 0x09, 0x10, 0x18,
+      0x11, 0x0A, 0x04, 0x05, 0x0B, 0x12, 0x19, 0x20,
+      0x28, 0x30, 0x21, 0x1A, 0x13, 0x0C, 0x06, 0x07,
+      0x0D, 0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39,
+      0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x0F, 0x16, 0x1D,
+      0x24, 0x2B, 0x32, 0x3A, 0x33, 0x2C, 0x25, 0x1E,
+      0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3B, 0x3C, 0x35,
+      0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, },
+    { 0x00, 0x08, 0x10, 0x01, 0x18, 0x20, 0x28, 0x09,
+      0x02, 0x03, 0x0A, 0x11, 0x19, 0x30, 0x38, 0x29,
+      0x21, 0x1A, 0x12, 0x0B, 0x04, 0x05, 0x0C, 0x13,
+      0x1B, 0x22, 0x31, 0x39, 0x32, 0x2A, 0x23, 0x1C,
+      0x14, 0x0D, 0x06, 0x07, 0x0E, 0x15, 0x1D, 0x24,
+      0x2B, 0x33, 0x3A, 0x3B, 0x34, 0x2C, 0x25, 0x1E,
+      0x16, 0x0F, 0x17, 0x1F, 0x26, 0x2D, 0x3C, 0x35,
+      0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, }
+    };
+
+    static const uint8_t *const scantables[] = {
+        ff_alternate_vertical_scan,
+        ff_alternate_horizontal_scan,
+        ff_zigzag_direct,
+        ff_wmv1_scantable[0],
+        ff_wmv1_scantable[1],
+        ff_wmv1_scantable[2],
+        ff_wmv1_scantable[3],
+    };
+    static const uint8_t *scantable = NULL;
+    static enum idct_permutation_type idct_permutation;
+
+    if (!scantable) {
+        scantable        = scantables[rnd() % FF_ARRAY_ELEMS(scantables)];
+        idct_permutation = permutation_types[rnd() % 
FF_ARRAY_ELEMS(permutation_types)];
+    }
+    ff_init_scantable_permutation(s->idsp.idct_permutation, idct_permutation);
+    ff_init_scantable(s->idsp.idct_permutation,
+                      intra_scantable ? &s->intra_scantable : 
&s->inter_scantable,
+                      scantable);
+}
+
+static void init_h263_test(MPVContext *const s, int16_t block[64],
+                           int last_nonzero_coeff, int qscale, int intra)
+{
+    const uint8_t *permutation = s->inter_scantable.permutated;
+    if (intra) {
+        permutation = s->intra_scantable.permutated;
+        block[0]    = rnd() & 511;
+        static int h263_aic = -1, ac_pred;
+        if (h263_aic < 0) {
+            h263_aic = rnd() & 1;
+            ac_pred  = rnd() & 1;
+        }
+        s->h263_aic = h263_aic;
+        s->ac_pred  = ac_pred;
+        if (s->ac_pred)
+            last_nonzero_coeff = 63;
+    }
+    for (int i = intra; i <= last_nonzero_coeff; ++i) {
+        int random = rnd();
+        if (random & 1)
+            continue;
+        random >>= 1;
+        // Select level so that the multiplication fits into 16 bits.
+        // FIXME: The FLV and MPEG-4 decoders can have escape values exceeding 
this.
+        block[permutation[i]] = sign_extend(random, 10);
+    }
+}
+
+static void init_mpeg12_test(MPVContext *const s, int16_t block[64],
+                             int last_nonzero_coeff, int qscale, int intra,
+                             enum TestType type)
+{
+    uint16_t *matrix = intra ? s->intra_matrix : s->inter_matrix;
+
+    if (type == MPEG2)
+        qscale = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : qscale 
<< 1;
+
+    for (int i = 0; i < 64; ++i)
+        matrix[i] = 1 + rnd() % 254;
+
+    const uint8_t *permutation = s->intra_scantable.permutated;
+    if (intra) {
+        block[0] = (int8_t)rnd();
+        for (int i = 1; i <= last_nonzero_coeff; ++i) {
+            int j = permutation[i];
+            unsigned random = rnd();
+            if (random & 1)
+                continue;
+            random >>= 1;
+            // Select level so that the multiplication does not overflow
+            // an int16_t and so that it is within the possible range
+            // (-2048..2047). FIXME: It seems that this need not be fulfilled
+            // in practice for the MPEG-4 decoder at least.
+            int limit = FFMIN(INT16_MAX / (qscale * matrix[j]), 2047);
+            block[j] = random % (2 * limit + 1) - limit;
+        }
+    } else {
+        for (int i = 0; i <= last_nonzero_coeff; ++i) {
+            int j = permutation[i];
+            unsigned random = rnd();
+            if (random & 1)
+                continue;
+            random >>= 1;
+            int limit = FFMIN((INT16_MAX / (qscale * matrix[j]) - 1) / 2, 
2047);
+            block[j] = random % (2 * limit + 1) - limit;
+        }
+    }
+}
+
+void checkasm_check_mpegvideo_unquantize(void)
+{
+    static const struct {
+        const char *name;
+        size_t offset;
+        int intra, intra_scantable;
+        enum TestType type;
+    } tests[] = {
+#define TEST(NAME, INTRA, INTRA_SCANTABLE, TYPE)                         \
+    { .name = #NAME, .offset = offsetof(MPVUnquantDSPContext, NAME),     \
+      .intra = INTRA, .intra_scantable = INTRA_SCANTABLE, .type = TYPE }
+        TEST(dct_unquantize_mpeg1_intra, 1, 1, MPEG1),
+        TEST(dct_unquantize_mpeg1_inter, 0, 1, MPEG1),
+        TEST(dct_unquantize_mpeg2_intra, 1, 1, MPEG2),
+        TEST(dct_unquantize_mpeg2_inter, 0, 1, MPEG2),
+        TEST(dct_unquantize_h263_intra,  1, 1, H263),
+        TEST(dct_unquantize_h263_inter,  0, 0, H263),
+    };
+    MPVUnquantDSPContext unquant_dsp_ctx;
+    int q_scale_type = rnd() & 1;
+
+    ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
+    declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block, 
int n, int qscale);
+
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+        void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =
+            *(void (**)(MPVContext *, int16_t *, int, 
int))((char*)&unquant_dsp_ctx + tests[i].offset);
+        if (check_func(func, "%s", tests[i].name)) {
+            MPVContext new, ref;
+            DECLARE_ALIGNED(16, int16_t, block_new)[64];
+            DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+            static int block_last_index = -1;
+
+            randomize_struct(MPVContext, &ref);
+
+            ref.q_scale_type = q_scale_type;
+
+            init_idct_scantable(&ref, tests[i].intra_scantable);
+
+            if (block_last_index < 0)
+                block_last_index = rnd() % 64;
+
+            memset(block_ref, 0, sizeof(block_ref));
+
+            if (tests[i].intra) {
+                // Less restricted than real dc_scale values
+                ref.y_dc_scale = 1 + rnd() % 64;
+                ref.c_dc_scale = 1 + rnd() % 64;
+            }
+
+            static int qscale = 0;
+
+            if (qscale == 0)
+                qscale = 1 + rnd() % 31;
+
+            if (tests[i].type == H263)
+                init_h263_test(&ref, block_ref, block_last_index, qscale,
+                               tests[i].intra);
+            else
+                init_mpeg12_test(&ref, block_ref, block_last_index, qscale,
+                                 tests[i].intra, tests[i].type);
+
+            int n = rnd() % 6;
+            ref.block_last_index[n] = block_last_index;
+
+            memcpy(&new, &ref, sizeof(new));
+            memcpy(block_new, block_ref, sizeof(block_new));
+
+            call_ref(&ref, block_ref, n, qscale);
+            call_new(&new, block_new, n, qscale);
+
+            if (memcmp(&ref, &new, sizeof(new)) || memcmp(block_new, 
block_ref, sizeof(block_new)))
+                fail();
+
+            bench_new(&new, block_new, n, qscale);
+        }
+    }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index f182efde46..48edd17bf2 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -39,6 +39,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                       
          \
                 fate-checkasm-llviddspenc                               \
                 fate-checkasm-lpc                                       \
                 fate-checkasm-motion                                    \
+                fate-checkasm-mpegvideo_unquantize                      \
                 fate-checkasm-mpegvideoencdsp                           \
                 fate-checkasm-opusdsp                                   \
                 fate-checkasm-pixblockdsp                               \

commit e7a629049f7e9be397b0acabe75beb207ad9dc21
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 28 16:58:44 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100

    avcodec/{arm,neon}/mpegvideo: Use intra scantable to unquant H263 intra
    
    Forgotten in 70a7df049c411d9247eb6075720c84196c3e55e8.
    
    Using the wrong scantable matters for codecs for which both scantables
    can differ, namely the MPEG-4 decoder and the WMV1/2 codecs.
    
    For WMV1 it can lead to wrong output in case the IDCT permutation
    is FF_IDCT_PERM_PARTTRANS, because in this case the entries of
    the intra scantable's raster end are not always <= the corresponding
    entries of the inter scantable's raster end when the former is
    initialized via ff_wmv1_scantable[1] and the latter via 
ff_wmv1_scantable[0].
    FF_IDCT_PERM_PARTTRANS is used iff the Neon IDCT is used (for both arm
    and aarch64).* Said IDCT is not used during FATE, so that this issue
    went unnoticed.
    
    WMV2 uses the same scantables, but uses a custom IDCT
    which always uses FF_IDCT_PERM_NONE, for which the inter scantable's
    raster end is never smaller than the intra scantable's,
    so that the output is always correct for it.
    
    The scantable for MPEG-4 can change mid-stream (for the decoder),
    but since c41818dc5dc14eb944761204e7b0ac179a6dcd1a only the intra
    scantable is updated, so that both scantables can get out of sync.
    In such a case the unquantize intra functions could unquantize
    an incorrect number of coefficients.
    
    Using raster_end of the wrong scantable can also lead to an
    unnecessarily large amount of coefficients unquantized.
    
    *: FF_IDCT_PERM_SIMPLE and FF_IDCT_PERM_TRANSPOSE would also not work,
    but they are not used at all by arm and aarch64.
    
    Reviewed-by: Martin Storsjö <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index a2174b0a08..67e1f2ff6d 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -28,5 +28,6 @@
 #define BLOCK_LAST_INDEX         0x10
 #define H263_AIC                 0x40
 #define INTER_SCANTAB_RASTER_END 0x88
+#define INTRA_SCANTAB_RASTER_END 0x10c
 
 #endif /* AVCODEC_ARM_ASM_OFFSETS_H */
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index cb109cd832..593e998181 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -38,6 +38,8 @@ CHECK_OFFSET(MpegEncContext, ac_pred,          AC_PRED);
 CHECK_OFFSET(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
 CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
              INTER_SCANTAB_RASTER_END);
+CHECK_OFFSET(MpegEncContext, intra_scantable.raster_end,
+             INTRA_SCANTAB_RASTER_END);
 CHECK_OFFSET(MpegEncContext, h263_aic,         H263_AIC);
 #endif
 
diff --git a/libavcodec/arm/mpegvideo_armv5te.c 
b/libavcodec/arm/mpegvideo_armv5te.c
index 3a6d015767..b2790b48fe 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -73,7 +73,7 @@ static void dct_unquantize_h263_intra_armv5te(const 
MPVContext *s,
     if(s->ac_pred)
         nCoeffs=63;
     else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
 
     ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
     block[0] = level;
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 1889d7a912..c7a35ea267 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -77,7 +77,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         push            {r4-r6,lr}
         add             r12, r0,  #BLOCK_LAST_INDEX
         ldr             r6,  [r0, #AC_PRED]
-        add             lr,  r0,  #INTER_SCANTAB_RASTER_END
+        add             lr,  r0,  #INTRA_SCANTAB_RASTER_END
         cmp             r6,  #0
         it              ne
         movne           r12, #63
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index e21ce5164d..758bf57ab9 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -72,11 +72,11 @@ typedef struct MpegEncContext {
 
     /* scantables */
     ScanTable inter_scantable; ///< if inter == intra then intra should be 
used to reduce the cache usage
+    ScanTable intra_scantable;
 
     /* WARNING: changes above this line require updates to hardcoded
      *          offsets used in ASM. */
 
-    ScanTable intra_scantable;
     uint8_t permutated_intra_h_scantable[64];
     uint8_t permutated_intra_v_scantable[64];
 
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fdc57d3876..3427dbe427 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -112,7 +112,7 @@ static void dct_unquantize_h263_intra_neon(const MPVContext 
*s, int16_t *block,
     if (s->ac_pred) {
         nCoeffs = 63;
     } else {
-        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
         if (nCoeffs <= 0)
             return;
     }

commit 5d41d3e21dff14058b283491480a7382daeb5da9
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 19 12:00:08 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100

    avcodec/ppc/mpegvideo_altivec: Reindent after the previous commit
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/ppc/mpegvideo_altivec.c 
b/libavcodec/ppc/mpegvideo_altivec.c
index 7b54de3d91..71894e760b 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -43,30 +43,30 @@
 static av_always_inline
 void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int 
qmul)
 {
-        register const vector signed short vczero = (const vector signed 
short)vec_splat_s16(0);
-        DECLARE_ALIGNED(16, short, qmul8) = qmul;
-        DECLARE_ALIGNED(16, short, qadd8) = qadd;
-        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
-        register vector bool short blockv_null, blockv_neg;
+    register const vector signed short vczero = (const vector signed 
short)vec_splat_s16(0);
+    DECLARE_ALIGNED(16, short, qmul8) = qmul;
+    DECLARE_ALIGNED(16, short, qadd8) = qadd;
+    register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
+    register vector bool short blockv_null, blockv_neg;
 
-        qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
-        qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
-        nqaddv = vec_sub(vczero, qaddv);
+    qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
+    qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
+    nqaddv = vec_sub(vczero, qaddv);
 
-        // vectorize all the 16 bytes-aligned blocks
-        // of 8 elements
-        for (register int j = 0; j <= nb_coeffs; j += 8) {
-            blockv = vec_ld(j << 1, block);
-            blockv_neg = vec_cmplt(blockv, vczero);
-            blockv_null = vec_cmpeq(blockv, vczero);
-            // choose between +qadd or -qadd as the third operand
-            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
-            // multiply & add (block{i,i+7} * qmul [+-] qadd)
-            temp1 = vec_mladd(blockv, qmulv, temp1);
-            // put 0 where block[{i,i+7} used to have 0
-            blockv = vec_sel(temp1, blockv, blockv_null);
-            vec_st(blockv, j << 1, block);
-        }
+    // vectorize all the 16 bytes-aligned blocks
+    // of 8 elements
+    for (register int j = 0; j <= nb_coeffs; j += 8) {
+        blockv = vec_ld(j << 1, block);
+        blockv_neg = vec_cmplt(blockv, vczero);
+        blockv_null = vec_cmpeq(blockv, vczero);
+        // choose between +qadd or -qadd as the third operand
+        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
+        // multiply & add (block{i,i+7} * qmul [+-] qadd)
+        temp1 = vec_mladd(blockv, qmulv, temp1);
+        // put 0 where block[{i,i+7} used to have 0
+        blockv = vec_sel(temp1, blockv, blockv_null);
+        vec_st(blockv, j << 1, block);
+    }
 }
 
 static void dct_unquantize_h263_intra_altivec(const MPVContext *s,

commit 011ef7fc65fcbf2141adcec9ca805874bb0a6a16
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Wed Nov 19 11:51:03 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:42 2025 +0100

    avcodec/ppc/mpegvideo_altivec: Split intra/inter unquantizing
    
    Don't use a single function that checks mb_intra. Forgotten
    in d50635cd247e17fe16c63219b9ae80d45a8185b1.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/ppc/mpegvideo_altivec.c 
b/libavcodec/ppc/mpegvideo_altivec.c
index ad3a783a87..7b54de3d91 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,41 +40,14 @@
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(const MPVContext *s,
-                                        int16_t *block, int n, int qscale)
+static av_always_inline
+void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int qmul)
 {
-    int i, qmul, qadd;
-    int nCoeffs;
-
-    qadd = (qscale - 1) | 1;
-    qmul = qscale << 1;
-
-    if (s->mb_intra) {
-        if (!s->h263_aic) {
-            if (n < 4)
-                block[0] = block[0] * s->y_dc_scale;
-            else
-                block[0] = block[0] * s->c_dc_scale;
-        }else
-            qadd = 0;
-        i = 1;
-        if (s->ac_pred)
-            nCoeffs = 63;
-        else
-            nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
-    } else {
-        i = 0;
-        av_assert2(s->block_last_index[n]>=0);
-        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
-    }
-
-    {
         register const vector signed short vczero = (const vector signed short)vec_splat_s16(0);
         DECLARE_ALIGNED(16, short, qmul8) = qmul;
         DECLARE_ALIGNED(16, short, qadd8) = qadd;
         register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
         register vector bool short blockv_null, blockv_neg;
-        register short backup_0 = block[0];
 
         qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
         qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
@@ -82,7 +55,7 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
 
         // vectorize all the 16 bytes-aligned blocks
         // of 8 elements
-        for (register int j = 0; j <= nCoeffs ; j += 8) {
+        for (register int j = 0; j <= nb_coeffs; j += 8) {
             blockv = vec_ld(j << 1, block);
             blockv_neg = vec_cmplt(blockv, vczero);
             blockv_null = vec_cmpeq(blockv, vczero);
@@ -94,14 +67,36 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
             blockv = vec_sel(temp1, blockv, blockv_null);
             vec_st(blockv, j << 1, block);
         }
+}
 
-        if (i == 1) {
-            // cheat. this avoid special-casing the first iteration
-            block[0] = backup_0;
-        }
-    }
+static void dct_unquantize_h263_intra_altivec(const MPVContext *s,
+                                              int16_t *block, int n, int qscale)
+{
+    int qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    int block0 = block[0];
+    if (!s->h263_aic) {
+        block0 *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+    } else
+        qadd = 0;
+    int nb_coeffs = s->ac_pred ? 63 : s->intra_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+
+    // cheat. this avoid special-casing the first iteration
+    block[0] = block0;
 }
 
+static void dct_unquantize_h263_inter_altivec(const MPVContext *s,
+                                              int16_t *block, int n, int qscale)
+{
+    int qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    av_assert2(s->block_last_index[n]>=0);
+    int nb_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+}
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
@@ -110,7 +105,7 @@ av_cold void ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
-    s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_altivec;
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_altivec;
 #endif /* HAVE_ALTIVEC */
 }

commit 358c569b05bc6f9a107a5caebcc8da56e8bf9799
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Fri Nov 14 11:24:45 2025 +0100
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Wed Dec 3 10:20:41 2025 +0100

    avcodec/mpegvideo_unquantize: Constify MPVContext pointee
    
    Also use MPVContext instead of MpegEncContext.
    
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 5c96c9df2c..cb109cd832 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -41,9 +41,9 @@ CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
 CHECK_OFFSET(MpegEncContext, h263_aic,         H263_AIC);
 #endif
 
-void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
                                        int n, int qscale);
-void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
                                        int n, int qscale);
 
 av_cold void ff_mpv_unquantize_init_arm(MPVUnquantDSPContext *s, int bitexact)
diff --git a/libavcodec/arm/mpegvideo_armv5te.c b/libavcodec/arm/mpegvideo_armv5te.c
index 2737f68643..3a6d015767 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -50,8 +50,8 @@ static inline void dct_unquantize_h263_helper_c(int16_t *block, int qmul, int qa
 }
 #endif
 
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_armv5te(const MPVContext *s,
+                                              int16_t *block, int n, int qscale)
 {
     int level, qmul, qadd;
     int nCoeffs;
@@ -79,8 +79,8 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
     block[0] = level;
 }
 
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_armv5te(const MPVContext *s,
+                                              int16_t *block, int n, int qscale)
 {
     int qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
index d4de2233a7..5ea9fcbb88 100644
--- a/libavcodec/mips/h263dsp_mips.h
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -25,11 +25,11 @@
 
 void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale);
 void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale);
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s, int16_t *block,
                                        int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s, int16_t *block,
                                       int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s, int16_t *block,
                                       int32_t index, int32_t q_scale);
 int ff_pix_sum_msa(const uint8_t *pix, ptrdiff_t line_size);
 
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 2a9ea4006e..2544279ac5 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -23,16 +23,16 @@
 
 #include "libavcodec/mpegvideo.h"
 
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale);
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale);
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
 void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t offset[64]);
 
 #endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
index 87d4aafd8c..90bd90c147 100644
--- a/libavcodec/mips/mpegvideo_mmi.c
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -25,8 +25,8 @@
 #include "mpegvideo_mips.h"
 #include "libavutil/mips/mmiutils.h"
 
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale)
 {
     int64_t level, nCoeffs;
     double ftmp[6];
@@ -101,8 +101,8 @@ void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
     block[0] = level;
 }
 
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale)
 {
     int64_t nCoeffs;
     double ftmp[6];
@@ -160,8 +160,8 @@ void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
     );
 }
 
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     int64_t nCoeffs;
     const uint16_t *quant_matrix;
@@ -254,8 +254,8 @@ void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
     block[0] = block0;
 }
 
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     int64_t nCoeffs;
     const uint16_t *quant_matrix;
@@ -342,8 +342,8 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
     );
 }
 
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     uint64_t nCoeffs;
     const uint16_t *quant_matrix;
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
index cd4adc0f77..a870a2cd79 100644
--- a/libavcodec/mips/mpegvideo_msa.c
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -194,7 +194,7 @@ static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
     return sum_res;
 }
 
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
 {
@@ -219,7 +219,7 @@ void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
     h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
 }
 
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
 {
@@ -236,7 +236,7 @@ void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
     h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
 }
 
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s,
                                        int16_t *block, int32_t index,
                                        int32_t qscale)
 {
diff --git a/libavcodec/mpeg4videodec.h b/libavcodec/mpeg4videodec.h
index aafde454ea..2eafa1ef8b 100644
--- a/libavcodec/mpeg4videodec.h
+++ b/libavcodec/mpeg4videodec.h
@@ -93,11 +93,11 @@ typedef struct Mpeg4DecContext {
 
     Mpeg4VideoDSPContext mdsp;
 
-    void (*dct_unquantize_mpeg2_inter)(MpegEncContext *s,
+    void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
                                        int16_t *block, int n, int qscale);
-    void (*dct_unquantize_mpeg2_intra)(MpegEncContext *s,
+    void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
                                        int16_t *block, int n, int qscale);
-    void (*dct_unquantize_h263_intra)(MpegEncContext *s,
+    void (*dct_unquantize_h263_intra)(const MPVContext *s,
                                       int16_t *block, int n, int qscale);
 
     union {
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index cb4b99acd3..e21ce5164d 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -57,6 +57,8 @@ enum OutputFormat {
     FMT_SPEEDHQ,
 };
 
+typedef struct MpegEncContext MPVContext;
+
 /**
  * MpegEncContext.
  */
@@ -271,10 +273,10 @@ typedef struct MpegEncContext {
     int interlaced_dct;
     int first_field;         ///< is 1 for the first field of a field picture 0 otherwise
 
-    void (*dct_unquantize_intra)(struct MpegEncContext *s, // unquantizer to use (MPEG-4 can use both)
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_inter)(struct MpegEncContext *s, // unquantizer to use (MPEG-4 can use both)
-                           int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_intra)(const MPVContext *s, // unquantizer to use (MPEG-4 can use both)
+                                 int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_inter)(const MPVContext *s, // unquantizer to use (MPEG-4 can use both)
+                                 int16_t *block/*align 16*/, int n, int qscale);
 
     /* flag to indicate a reinitialization is required, e.g. after
      * a frame size change */
@@ -286,8 +288,6 @@ typedef struct MpegEncContext {
     ERContext er;
 } MpegEncContext;
 
-typedef MpegEncContext MPVContext;
-
 /**
  * Set the given MpegEncContext to common defaults (same for encoding
  * and decoding).  The changed fields will not depend upon the prior
diff --git a/libavcodec/mpegvideo_unquantize.c b/libavcodec/mpegvideo_unquantize.c
index 213e37a514..06c29d0753 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,8 +33,8 @@
 #include "mpegvideodata.h"
 #include "mpegvideo_unquantize.h"
 
-static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -62,8 +62,8 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -91,8 +91,8 @@ static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -120,8 +120,8 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_bitexact(const MPVContext *s,
+                                                int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -153,8 +153,8 @@ static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
     block[63]^=sum&1;
 }
 
-static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -186,8 +186,8 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
     block[63]^=sum&1;
 }
 
-static void dct_unquantize_h263_intra_c(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_c(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, level, qmul, qadd;
     int nCoeffs;
@@ -220,8 +220,8 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_h263_inter_c(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_c(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, level, qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/mpegvideo_unquantize.h b/libavcodec/mpegvideo_unquantize.h
index 3e6d8aedf7..1a43f467c6 100644
--- a/libavcodec/mpegvideo_unquantize.h
+++ b/libavcodec/mpegvideo_unquantize.h
@@ -29,21 +29,21 @@
 
 #include "config.h"
 
-typedef struct MpegEncContext MpegEncContext;
+typedef struct MpegEncContext MPVContext;
 
 typedef struct MPVUnquantDSPContext {
-    void (*dct_unquantize_mpeg1_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg1_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg2_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg2_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_h263_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_h263_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_mpeg1_intra)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_mpeg1_inter)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_h263_intra)(const MPVContext *s,
+                                      int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_h263_inter)(const MPVContext *s,
+                                      int16_t *block/*align 16*/, int n, int qscale);
 } MPVUnquantDSPContext;
 
 #if !ARCH_MIPS
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index a0276ad808..fdc57d3876 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -84,7 +84,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs
     vst1_s16(block, d0s16);
 }
 
-static void dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
                                            int n, int qscale)
 {
     int nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
@@ -93,7 +93,7 @@ static void dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
     ff_dct_unquantize_h263_neon(qscale, qadd, nCoeffs + 1, block);
 }
 
-static void dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
                                            int n, int qscale)
 {
     int qadd;
diff --git a/libavcodec/ppc/mpegvideo_altivec.c b/libavcodec/ppc/mpegvideo_altivec.c
index 26e98acfb8..ad3a783a87 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,8 +40,8 @@
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(MpegEncContext *s,
-                                 int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_altivec(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 8632acd412..4c3299362e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,8 @@
 
 #if HAVE_MMX_INLINE
 
-static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
+                                          int16_t *block, int n, int qscale)
 {
     x86_reg level, qmul, qadd, nCoeffs;
 
@@ -105,8 +105,8 @@ __asm__ volatile(
 }
 
 
-static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
+                                          int16_t *block, int n, int qscale)
 {
     x86_reg qmul, qadd, nCoeffs;
 
@@ -166,8 +166,8 @@ __asm__ volatile(
         );
 }
 
-static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -235,8 +235,8 @@ __asm__ volatile(
     block[0]= block0;
 }
 
-static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -301,8 +301,8 @@ __asm__ volatile(
         );
 }
 
-static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -369,8 +369,8 @@ __asm__ volatile(
         //Note, we do not do mismatch control for intra as errors cannot accumulate
 }
 
-static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/arm/asm-offsets.h          |   1 +
 libavcodec/arm/mpegvideo_arm.c        |   6 +-
 libavcodec/arm/mpegvideo_armv5te.c    |  10 +-
 libavcodec/arm/mpegvideo_neon.S       |  20 +-
 libavcodec/mips/h263dsp_mips.h        |   6 +-
 libavcodec/mips/mpegvideo_mips.h      |  20 +-
 libavcodec/mips/mpegvideo_mmi.c       |  20 +-
 libavcodec/mips/mpegvideo_msa.c       |   6 +-
 libavcodec/mpeg4videodec.h            |   6 +-
 libavcodec/mpegvideo.c                |  15 -
 libavcodec/mpegvideo.h                |  24 +-
 libavcodec/mpegvideo_unquantize.c     |  42 ++-
 libavcodec/mpegvideo_unquantize.h     |  26 +-
 libavcodec/neon/mpegvideo.c           |  28 +-
 libavcodec/ppc/mpegvideo_altivec.c    | 105 +++---
 libavcodec/x86/mpegvideo.c            | 595 +++++++++++++++-------------------
 tests/checkasm/Makefile               |   1 +
 tests/checkasm/checkasm.c             |   3 +
 tests/checkasm/checkasm.h             |   1 +
 tests/checkasm/mpegvideo_unquantize.c | 273 ++++++++++++++++
 tests/fate/checkasm.mak               |   1 +
 21 files changed, 704 insertions(+), 505 deletions(-)
 create mode 100644 tests/checkasm/mpegvideo_unquantize.c


hooks/post-receive
--

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] branch master updated. c22c2c5e03 avcodec/mpegvideo: Port dct_unquantize_mpeg2_intra_mmx to SSE2

Reply via email to