[FFmpeg-devel] [PATCH] Port mpegvideo unquantize to SSE2/SSSE3 (PR #21049)

mkver via ffmpeg-devel Sat, 29 Nov 2025 15:16:25 -0800

PR #21049 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21049
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21049.patch


Also add a checkasm test and fix some bugs in the aarch64/arm unquantize 
functions.


>From 9b23d53119d9291c83d07f085e257183026ec938 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 14 Nov 2025 11:24:45 +0100
Subject: [PATCH 01/16] avcodec/mpegvideo_unquantize: Constify MPVContext
 pointee

Also use MPVContext instead of MpegEncContext.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/arm/mpegvideo_arm.c     |  4 ++--
 libavcodec/arm/mpegvideo_armv5te.c |  8 ++++----
 libavcodec/mips/h263dsp_mips.h     |  6 +++---
 libavcodec/mips/mpegvideo_mips.h   | 20 ++++++++++----------
 libavcodec/mips/mpegvideo_mmi.c    | 20 ++++++++++----------
 libavcodec/mips/mpegvideo_msa.c    |  6 +++---
 libavcodec/mpeg4videodec.h         |  6 +++---
 libavcodec/mpegvideo.h             | 12 ++++++------
 libavcodec/mpegvideo_unquantize.c  | 28 ++++++++++++++--------------
 libavcodec/mpegvideo_unquantize.h  | 26 +++++++++++++-------------
 libavcodec/neon/mpegvideo.c        |  4 ++--
 libavcodec/ppc/mpegvideo_altivec.c |  4 ++--
 libavcodec/x86/mpegvideo.c         | 24 ++++++++++++------------
 13 files changed, 84 insertions(+), 84 deletions(-)

diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index 5c96c9df2c..cb109cd832 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -41,9 +41,9 @@ CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
 CHECK_OFFSET(MpegEncContext, h263_aic,         H263_AIC);
 #endif
 
-void ff_dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
                                        int n, int qscale);
-void ff_dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
                                        int n, int qscale);
 
 av_cold void ff_mpv_unquantize_init_arm(MPVUnquantDSPContext *s, int bitexact)
diff --git a/libavcodec/arm/mpegvideo_armv5te.c 
b/libavcodec/arm/mpegvideo_armv5te.c
index 2737f68643..3a6d015767 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -50,8 +50,8 @@ static inline void dct_unquantize_h263_helper_c(int16_t 
*block, int qmul, int qa
 }
 #endif
 
-static void dct_unquantize_h263_intra_armv5te(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_armv5te(const MPVContext *s,
+                                              int16_t *block, int n, int 
qscale)
 {
     int level, qmul, qadd;
     int nCoeffs;
@@ -79,8 +79,8 @@ static void dct_unquantize_h263_intra_armv5te(MpegEncContext 
*s,
     block[0] = level;
 }
 
-static void dct_unquantize_h263_inter_armv5te(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_armv5te(const MPVContext *s,
+                                              int16_t *block, int n, int 
qscale)
 {
     int qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/mips/h263dsp_mips.h b/libavcodec/mips/h263dsp_mips.h
index d4de2233a7..5ea9fcbb88 100644
--- a/libavcodec/mips/h263dsp_mips.h
+++ b/libavcodec/mips/h263dsp_mips.h
@@ -25,11 +25,11 @@
 
 void ff_h263_h_loop_filter_msa(uint8_t *src, int stride, int q_scale);
 void ff_h263_v_loop_filter_msa(uint8_t *src, int stride, int q_scale);
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s, int16_t *block,
                                        int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s, int16_t *block,
                                       int32_t index, int32_t q_scale);
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s, int16_t *block,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s, int16_t *block,
                                       int32_t index, int32_t q_scale);
 int ff_pix_sum_msa(const uint8_t *pix, ptrdiff_t line_size);
 
diff --git a/libavcodec/mips/mpegvideo_mips.h b/libavcodec/mips/mpegvideo_mips.h
index 2a9ea4006e..2544279ac5 100644
--- a/libavcodec/mips/mpegvideo_mips.h
+++ b/libavcodec/mips/mpegvideo_mips.h
@@ -23,16 +23,16 @@
 
 #include "libavcodec/mpegvideo.h"
 
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale);
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale);
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale);
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale);
 void ff_denoise_dct_mmi(int16_t block[64], int sum[64], const uint16_t 
offset[64]);
 
 #endif /* AVCODEC_MIPS_MPEGVIDEO_MIPS_H */
diff --git a/libavcodec/mips/mpegvideo_mmi.c b/libavcodec/mips/mpegvideo_mmi.c
index 87d4aafd8c..90bd90c147 100644
--- a/libavcodec/mips/mpegvideo_mmi.c
+++ b/libavcodec/mips/mpegvideo_mmi.c
@@ -25,8 +25,8 @@
 #include "mpegvideo_mips.h"
 #include "libavutil/mips/mmiutils.h"
 
-void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_h263_intra_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale)
 {
     int64_t level, nCoeffs;
     double ftmp[6];
@@ -101,8 +101,8 @@ void ff_dct_unquantize_h263_intra_mmi(MpegEncContext *s, 
int16_t *block,
     block[0] = level;
 }
 
-void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_h263_inter_mmi(const MPVContext *s, int16_t *block,
+                                      int n, int qscale)
 {
     int64_t nCoeffs;
     double ftmp[6];
@@ -160,8 +160,8 @@ void ff_dct_unquantize_h263_inter_mmi(MpegEncContext *s, 
int16_t *block,
     );
 }
 
-void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg1_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     int64_t nCoeffs;
     const uint16_t *quant_matrix;
@@ -254,8 +254,8 @@ void ff_dct_unquantize_mpeg1_intra_mmi(MpegEncContext *s, 
int16_t *block,
     block[0] = block0;
 }
 
-void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg1_inter_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     int64_t nCoeffs;
     const uint16_t *quant_matrix;
@@ -342,8 +342,8 @@ void ff_dct_unquantize_mpeg1_inter_mmi(MpegEncContext *s, 
int16_t *block,
     );
 }
 
-void ff_dct_unquantize_mpeg2_intra_mmi(MpegEncContext *s, int16_t *block,
-        int n, int qscale)
+void ff_dct_unquantize_mpeg2_intra_mmi(const MPVContext *s, int16_t *block,
+                                       int n, int qscale)
 {
     uint64_t nCoeffs;
     const uint16_t *quant_matrix;
diff --git a/libavcodec/mips/mpegvideo_msa.c b/libavcodec/mips/mpegvideo_msa.c
index cd4adc0f77..a870a2cd79 100644
--- a/libavcodec/mips/mpegvideo_msa.c
+++ b/libavcodec/mips/mpegvideo_msa.c
@@ -194,7 +194,7 @@ static int32_t mpeg2_dct_unquantize_inter_msa(int16_t 
*block,
     return sum_res;
 }
 
-void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_intra_msa(const MPVContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
 {
@@ -219,7 +219,7 @@ void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
     h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
 }
 
-void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_h263_inter_msa(const MPVContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
 {
@@ -236,7 +236,7 @@ void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
     h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
 }
 
-void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
+void ff_dct_unquantize_mpeg2_inter_msa(const MPVContext *s,
                                        int16_t *block, int32_t index,
                                        int32_t qscale)
 {
diff --git a/libavcodec/mpeg4videodec.h b/libavcodec/mpeg4videodec.h
index aafde454ea..2eafa1ef8b 100644
--- a/libavcodec/mpeg4videodec.h
+++ b/libavcodec/mpeg4videodec.h
@@ -93,11 +93,11 @@ typedef struct Mpeg4DecContext {
 
     Mpeg4VideoDSPContext mdsp;
 
-    void (*dct_unquantize_mpeg2_inter)(MpegEncContext *s,
+    void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
                                        int16_t *block, int n, int qscale);
-    void (*dct_unquantize_mpeg2_intra)(MpegEncContext *s,
+    void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
                                        int16_t *block, int n, int qscale);
-    void (*dct_unquantize_h263_intra)(MpegEncContext *s,
+    void (*dct_unquantize_h263_intra)(const MPVContext *s,
                                       int16_t *block, int n, int qscale);
 
     union {
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index cb4b99acd3..e21ce5164d 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -57,6 +57,8 @@ enum OutputFormat {
     FMT_SPEEDHQ,
 };
 
+typedef struct MpegEncContext MPVContext;
+
 /**
  * MpegEncContext.
  */
@@ -271,10 +273,10 @@ typedef struct MpegEncContext {
     int interlaced_dct;
     int first_field;         ///< is 1 for the first field of a field picture 
0 otherwise
 
-    void (*dct_unquantize_intra)(struct MpegEncContext *s, // unquantizer to 
use (MPEG-4 can use both)
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_inter)(struct MpegEncContext *s, // unquantizer to 
use (MPEG-4 can use both)
-                           int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_intra)(const MPVContext *s, // unquantizer to use 
(MPEG-4 can use both)
+                                 int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_inter)(const MPVContext *s, // unquantizer to use 
(MPEG-4 can use both)
+                                 int16_t *block/*align 16*/, int n, int 
qscale);
 
     /* flag to indicate a reinitialization is required, e.g. after
      * a frame size change */
@@ -286,8 +288,6 @@ typedef struct MpegEncContext {
     ERContext er;
 } MpegEncContext;
 
-typedef MpegEncContext MPVContext;
-
 /**
  * Set the given MpegEncContext to common defaults (same for encoding
  * and decoding).  The changed fields will not depend upon the prior
diff --git a/libavcodec/mpegvideo_unquantize.c 
b/libavcodec/mpegvideo_unquantize.c
index 213e37a514..06c29d0753 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,8 +33,8 @@
 #include "mpegvideodata.h"
 #include "mpegvideo_unquantize.h"
 
-static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -62,8 +62,8 @@ static void dct_unquantize_mpeg1_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -91,8 +91,8 @@ static void dct_unquantize_mpeg1_inter_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -120,8 +120,8 @@ static void dct_unquantize_mpeg2_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_bitexact(const MPVContext *s,
+                                                int16_t *block, int n, int 
qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -153,8 +153,8 @@ static void 
dct_unquantize_mpeg2_intra_bitexact(MpegEncContext *s,
     block[63]^=sum&1;
 }
 
-static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
-                                   int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_c(const MPVContext *s,
+                                         int16_t *block, int n, int qscale)
 {
     int i, level, nCoeffs;
     const uint16_t *quant_matrix;
@@ -186,8 +186,8 @@ static void dct_unquantize_mpeg2_inter_c(MpegEncContext *s,
     block[63]^=sum&1;
 }
 
-static void dct_unquantize_h263_intra_c(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_c(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, level, qmul, qadd;
     int nCoeffs;
@@ -220,8 +220,8 @@ static void dct_unquantize_h263_intra_c(MpegEncContext *s,
     }
 }
 
-static void dct_unquantize_h263_inter_c(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_c(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, level, qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/mpegvideo_unquantize.h 
b/libavcodec/mpegvideo_unquantize.h
index 3e6d8aedf7..1a43f467c6 100644
--- a/libavcodec/mpegvideo_unquantize.h
+++ b/libavcodec/mpegvideo_unquantize.h
@@ -29,21 +29,21 @@
 
 #include "config.h"
 
-typedef struct MpegEncContext MpegEncContext;
+typedef struct MpegEncContext MPVContext;
 
 typedef struct MPVUnquantDSPContext {
-    void (*dct_unquantize_mpeg1_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg1_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg2_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_mpeg2_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_h263_intra)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
-    void (*dct_unquantize_h263_inter)(struct MpegEncContext *s,
-                           int16_t *block/*align 16*/, int n, int qscale);
+    void (*dct_unquantize_mpeg1_intra)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_mpeg1_inter)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_mpeg2_intra)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_mpeg2_inter)(const MPVContext *s,
+                                       int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_h263_intra)(const MPVContext *s,
+                                      int16_t *block/*align 16*/, int n, int 
qscale);
+    void (*dct_unquantize_h263_inter)(const MPVContext *s,
+                                      int16_t *block/*align 16*/, int n, int 
qscale);
 } MPVUnquantDSPContext;
 
 #if !ARCH_MIPS
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index a0276ad808..fdc57d3876 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -84,7 +84,7 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, 
int qadd, int nCoeffs
     vst1_s16(block, d0s16);
 }
 
-static void dct_unquantize_h263_inter_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
                                            int n, int qscale)
 {
     int nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
@@ -93,7 +93,7 @@ static void dct_unquantize_h263_inter_neon(MpegEncContext *s, 
int16_t *block,
     ff_dct_unquantize_h263_neon(qscale, qadd, nCoeffs + 1, block);
 }
 
-static void dct_unquantize_h263_intra_neon(MpegEncContext *s, int16_t *block,
+static void dct_unquantize_h263_intra_neon(const MPVContext *s, int16_t *block,
                                            int n, int qscale)
 {
     int qadd;
diff --git a/libavcodec/ppc/mpegvideo_altivec.c 
b/libavcodec/ppc/mpegvideo_altivec.c
index 26e98acfb8..ad3a783a87 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,8 +40,8 @@
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(MpegEncContext *s,
-                                 int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_altivec(const MPVContext *s,
+                                        int16_t *block, int n, int qscale)
 {
     int i, qmul, qadd;
     int nCoeffs;
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 8632acd412..4c3299362e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,8 @@
 
 #if HAVE_MMX_INLINE
 
-static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
+                                          int16_t *block, int n, int qscale)
 {
     x86_reg level, qmul, qadd, nCoeffs;
 
@@ -105,8 +105,8 @@ __asm__ volatile(
 }
 
 
-static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
-                                  int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
+                                          int16_t *block, int n, int qscale)
 {
     x86_reg qmul, qadd, nCoeffs;
 
@@ -166,8 +166,8 @@ __asm__ volatile(
         );
 }
 
-static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -235,8 +235,8 @@ __asm__ volatile(
     block[0]= block0;
 }
 
-static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -301,8 +301,8 @@ __asm__ volatile(
         );
 }
 
-static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -369,8 +369,8 @@ __asm__ volatile(
         //Note, we do not do mismatch control for intra as errors cannot 
accumulate
 }
 
-static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
-                                     int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
+                                           int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
-- 
2.49.1


>From 3f0aa88e8986816757da62e03ec0b79d1ba9e438 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 14 Nov 2025 12:10:18 +0100
Subject: [PATCH 02/16] avcodec/idctdsp: Disable unused permutation code

FF_IDCT_PERM_PARTTRANS is only used on AArch64 and ARM.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/idctdsp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index 8a71c7ef77..b166fb7f92 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -60,10 +60,12 @@ av_cold void ff_init_scantable_permutation(uint8_t 
*idct_permutation,
         for (i = 0; i < 64; i++)
             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
         break;
+#if ARCH_ARM || ARCH_AARCH64
     case FF_IDCT_PERM_PARTTRANS:
         for (i = 0; i < 64; i++)
             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
         break;
+#endif
     default:
         av_log(NULL, AV_LOG_ERROR,
                "Internal error, IDCT permutation not set\n");
-- 
2.49.1


>From 141ecad63981e2a3a54921b7c0b752226a808941 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 15 Nov 2025 08:31:26 +0100
Subject: [PATCH 03/16] avcodec/idctdsp: Optimize impossible permutations away

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/idctdsp.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libavcodec/idctdsp.c b/libavcodec/idctdsp.c
index b166fb7f92..23958373d9 100644
--- a/libavcodec/idctdsp.c
+++ b/libavcodec/idctdsp.c
@@ -56,10 +56,12 @@ av_cold void ff_init_scantable_permutation(uint8_t 
*idct_permutation,
         for (i = 0; i < 64; i++)
             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
         break;
+#if ARCH_PPC || ARCH_X86
     case FF_IDCT_PERM_TRANSPOSE:
         for (i = 0; i < 64; i++)
             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
         break;
+#endif
 #if ARCH_ARM || ARCH_AARCH64
     case FF_IDCT_PERM_PARTTRANS:
         for (i = 0; i < 64; i++)
-- 
2.49.1


>From 1efa206583fd0bd108df5a6113cc7d34597f791e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 19 Nov 2025 11:51:03 +0100
Subject: [PATCH 04/16] avcodec/ppc/mpegvideo_altivec: Split intra/inter
 unquantizing

Don't use a single function that checks mb_intra. Forgotten
in d50635cd247e17fe16c63219b9ae80d45a8185b1.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/ppc/mpegvideo_altivec.c | 71 ++++++++++++++----------------
 1 file changed, 33 insertions(+), 38 deletions(-)

diff --git a/libavcodec/ppc/mpegvideo_altivec.c 
b/libavcodec/ppc/mpegvideo_altivec.c
index ad3a783a87..7b54de3d91 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -40,41 +40,14 @@
 
 /* AltiVec version of dct_unquantize_h263
    this code assumes `block' is 16 bytes-aligned */
-static void dct_unquantize_h263_altivec(const MPVContext *s,
-                                        int16_t *block, int n, int qscale)
+static av_always_inline
+void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int 
qmul)
 {
-    int i, qmul, qadd;
-    int nCoeffs;
-
-    qadd = (qscale - 1) | 1;
-    qmul = qscale << 1;
-
-    if (s->mb_intra) {
-        if (!s->h263_aic) {
-            if (n < 4)
-                block[0] = block[0] * s->y_dc_scale;
-            else
-                block[0] = block[0] * s->c_dc_scale;
-        }else
-            qadd = 0;
-        i = 1;
-        if (s->ac_pred)
-            nCoeffs = 63;
-        else
-            nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
-    } else {
-        i = 0;
-        av_assert2(s->block_last_index[n]>=0);
-        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
-    }
-
-    {
         register const vector signed short vczero = (const vector signed 
short)vec_splat_s16(0);
         DECLARE_ALIGNED(16, short, qmul8) = qmul;
         DECLARE_ALIGNED(16, short, qadd8) = qadd;
         register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
         register vector bool short blockv_null, blockv_neg;
-        register short backup_0 = block[0];
 
         qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
         qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
@@ -82,7 +55,7 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
 
         // vectorize all the 16 bytes-aligned blocks
         // of 8 elements
-        for (register int j = 0; j <= nCoeffs ; j += 8) {
+        for (register int j = 0; j <= nb_coeffs; j += 8) {
             blockv = vec_ld(j << 1, block);
             blockv_neg = vec_cmplt(blockv, vczero);
             blockv_null = vec_cmpeq(blockv, vczero);
@@ -94,14 +67,36 @@ static void dct_unquantize_h263_altivec(const MPVContext *s,
             blockv = vec_sel(temp1, blockv, blockv_null);
             vec_st(blockv, j << 1, block);
         }
-
-        if (i == 1) {
-            // cheat. this avoid special-casing the first iteration
-            block[0] = backup_0;
-        }
-    }
 }
 
+static void dct_unquantize_h263_intra_altivec(const MPVContext *s,
+                                              int16_t *block, int n, int 
qscale)
+{
+    int qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    int block0 = block[0];
+    if (!s->h263_aic) {
+        block0 *= n < 4 ? s->y_dc_scale : s->c_dc_scale;
+    } else
+        qadd = 0;
+    int nb_coeffs = s->ac_pred ? 63 : 
s->intra_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+
+    // cheat. this avoid special-casing the first iteration
+    block[0] = block0;
+}
+
+static void dct_unquantize_h263_inter_altivec(const MPVContext *s,
+                                              int16_t *block, int n, int 
qscale)
+{
+    int qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    av_assert2(s->block_last_index[n]>=0);
+    int nb_coeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+
+    dct_unquantize_h263_altivec(block, nb_coeffs, qadd, qmul);
+}
 #endif /* HAVE_ALTIVEC */
 
 av_cold void ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
@@ -110,7 +105,7 @@ av_cold void 
ff_mpv_unquantize_init_ppc(MPVUnquantDSPContext *s, int bitexact)
     if (!PPC_ALTIVEC(av_get_cpu_flags()))
         return;
 
-    s->dct_unquantize_h263_intra = dct_unquantize_h263_altivec;
-    s->dct_unquantize_h263_inter = dct_unquantize_h263_altivec;
+    s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_altivec;
+    s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_altivec;
 #endif /* HAVE_ALTIVEC */
 }
-- 
2.49.1


>From 16bba08b72276e68cc8c67e247d4f504cc920696 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Wed, 19 Nov 2025 12:00:08 +0100
Subject: [PATCH 05/16] avcodec/ppc/mpegvideo_altivec: Reindent after the
 previous commit

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/ppc/mpegvideo_altivec.c | 44 +++++++++++++++---------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/libavcodec/ppc/mpegvideo_altivec.c 
b/libavcodec/ppc/mpegvideo_altivec.c
index 7b54de3d91..71894e760b 100644
--- a/libavcodec/ppc/mpegvideo_altivec.c
+++ b/libavcodec/ppc/mpegvideo_altivec.c
@@ -43,30 +43,30 @@
 static av_always_inline
 void dct_unquantize_h263_altivec(int16_t *block, int nb_coeffs, int qadd, int 
qmul)
 {
-        register const vector signed short vczero = (const vector signed 
short)vec_splat_s16(0);
-        DECLARE_ALIGNED(16, short, qmul8) = qmul;
-        DECLARE_ALIGNED(16, short, qadd8) = qadd;
-        register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
-        register vector bool short blockv_null, blockv_neg;
+    register const vector signed short vczero = (const vector signed 
short)vec_splat_s16(0);
+    DECLARE_ALIGNED(16, short, qmul8) = qmul;
+    DECLARE_ALIGNED(16, short, qadd8) = qadd;
+    register vector signed short blockv, qmulv, qaddv, nqaddv, temp1;
+    register vector bool short blockv_null, blockv_neg;
 
-        qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
-        qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
-        nqaddv = vec_sub(vczero, qaddv);
+    qmulv = vec_splat((vec_s16)vec_lde(0, &qmul8), 0);
+    qaddv = vec_splat((vec_s16)vec_lde(0, &qadd8), 0);
+    nqaddv = vec_sub(vczero, qaddv);
 
-        // vectorize all the 16 bytes-aligned blocks
-        // of 8 elements
-        for (register int j = 0; j <= nb_coeffs; j += 8) {
-            blockv = vec_ld(j << 1, block);
-            blockv_neg = vec_cmplt(blockv, vczero);
-            blockv_null = vec_cmpeq(blockv, vczero);
-            // choose between +qadd or -qadd as the third operand
-            temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
-            // multiply & add (block{i,i+7} * qmul [+-] qadd)
-            temp1 = vec_mladd(blockv, qmulv, temp1);
-            // put 0 where block[{i,i+7} used to have 0
-            blockv = vec_sel(temp1, blockv, blockv_null);
-            vec_st(blockv, j << 1, block);
-        }
+    // vectorize all the 16 bytes-aligned blocks
+    // of 8 elements
+    for (register int j = 0; j <= nb_coeffs; j += 8) {
+        blockv = vec_ld(j << 1, block);
+        blockv_neg = vec_cmplt(blockv, vczero);
+        blockv_null = vec_cmpeq(blockv, vczero);
+        // choose between +qadd or -qadd as the third operand
+        temp1 = vec_sel(qaddv, nqaddv, blockv_neg);
+        // multiply & add (block{i,i+7} * qmul [+-] qadd)
+        temp1 = vec_mladd(blockv, qmulv, temp1);
+        // put 0 where block[{i,i+7} used to have 0
+        blockv = vec_sel(temp1, blockv, blockv_null);
+        vec_st(blockv, j << 1, block);
+    }
 }
 
 static void dct_unquantize_h263_intra_altivec(const MPVContext *s,
-- 
2.49.1


>From f9f5f576754622924a29647495975692dc0a181b Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 28 Nov 2025 16:58:44 +0100
Subject: [PATCH 06/16] avcodec/{arm,neon}/mpegvideo: Use intra scantable to
 unquant H263 intra

Forgotten in 70a7df049c411d9247eb6075720c84196c3e55e8.

Using the wrong scantable matters for codecs for which both scantables
can differ, namely the MPEG-4 decoder and the WMV1/2 codecs.

For WMV1 it can lead to wrong output in case the IDCT permutation
is FF_IDCT_PERM_PARTTRANS, because in this case the entries of
the intra scantable's raster end are not always <= the corresponding
entries of the inter scantable's raster end when the former is
initialized via ff_wmv1_scantable[1] and the latter via ff_wmv1_scantable[0].
FF_IDCT_PERM_PARTTRANS is used iff the Neon IDCT is used (for both arm
and aarch64).* Said IDCT is not used during FATE, so that this issue
went unnoticed.

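WMV2 uses the same scantables, but uses a custom IDCT
which always uses FF_IDCT_PERM_NONE, for which the entries of the
inter scantable's raster end are never smaller than the corresponding
entries of the intra scantable's raster end, so that the output is
always correct for it.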

The scantable for MPEG-4 can change mid-stream (for the decoder),
but since c41818dc5dc14eb944761204e7b0ac179a6dcd1a only the intra
scantable is updated, so that both scantables can get out of sync.
In such a case the unquantize intra functions could unquantize
an incorrect number of coefficients.

Using raster_end of the wrong scantable can also lead to an
unnecessarily large number of coefficients being unquantized.

*: FF_IDCT_PERM_SIMPLE and FF_IDCT_PERM_TRANSPOSE would also not work,
but they are not used at all by arm and aarch64.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/arm/asm-offsets.h       | 1 +
 libavcodec/arm/mpegvideo_arm.c     | 2 ++
 libavcodec/arm/mpegvideo_armv5te.c | 2 +-
 libavcodec/arm/mpegvideo_neon.S    | 2 +-
 libavcodec/mpegvideo.h             | 2 +-
 libavcodec/neon/mpegvideo.c        | 2 +-
 6 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/libavcodec/arm/asm-offsets.h b/libavcodec/arm/asm-offsets.h
index a2174b0a08..67e1f2ff6d 100644
--- a/libavcodec/arm/asm-offsets.h
+++ b/libavcodec/arm/asm-offsets.h
@@ -28,5 +28,6 @@
 #define BLOCK_LAST_INDEX         0x10
 #define H263_AIC                 0x40
 #define INTER_SCANTAB_RASTER_END 0x88
+#define INTRA_SCANTAB_RASTER_END 0x10c
 
 #endif /* AVCODEC_ARM_ASM_OFFSETS_H */
diff --git a/libavcodec/arm/mpegvideo_arm.c b/libavcodec/arm/mpegvideo_arm.c
index cb109cd832..593e998181 100644
--- a/libavcodec/arm/mpegvideo_arm.c
+++ b/libavcodec/arm/mpegvideo_arm.c
@@ -38,6 +38,8 @@ CHECK_OFFSET(MpegEncContext, ac_pred,          AC_PRED);
 CHECK_OFFSET(MpegEncContext, block_last_index, BLOCK_LAST_INDEX);
 CHECK_OFFSET(MpegEncContext, inter_scantable.raster_end,
              INTER_SCANTAB_RASTER_END);
+CHECK_OFFSET(MpegEncContext, intra_scantable.raster_end,
+             INTRA_SCANTAB_RASTER_END);
 CHECK_OFFSET(MpegEncContext, h263_aic,         H263_AIC);
 #endif
 
diff --git a/libavcodec/arm/mpegvideo_armv5te.c 
b/libavcodec/arm/mpegvideo_armv5te.c
index 3a6d015767..b2790b48fe 100644
--- a/libavcodec/arm/mpegvideo_armv5te.c
+++ b/libavcodec/arm/mpegvideo_armv5te.c
@@ -73,7 +73,7 @@ static void dct_unquantize_h263_intra_armv5te(const 
MPVContext *s,
     if(s->ac_pred)
         nCoeffs=63;
     else
-        nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
 
     ff_dct_unquantize_h263_armv5te(block, qmul, qadd, nCoeffs + 1);
     block[0] = level;
diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index 1889d7a912..c7a35ea267 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -77,7 +77,7 @@ function ff_dct_unquantize_h263_intra_neon, export=1
         push            {r4-r6,lr}
         add             r12, r0,  #BLOCK_LAST_INDEX
         ldr             r6,  [r0, #AC_PRED]
-        add             lr,  r0,  #INTER_SCANTAB_RASTER_END
+        add             lr,  r0,  #INTRA_SCANTAB_RASTER_END
         cmp             r6,  #0
         it              ne
         movne           r12, #63
diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index e21ce5164d..758bf57ab9 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -72,11 +72,11 @@ typedef struct MpegEncContext {
 
     /* scantables */
     ScanTable inter_scantable; ///< if inter == intra then intra should be 
used to reduce the cache usage
+    ScanTable intra_scantable;
 
     /* WARNING: changes above this line require updates to hardcoded
      *          offsets used in ASM. */
 
-    ScanTable intra_scantable;
     uint8_t permutated_intra_h_scantable[64];
     uint8_t permutated_intra_v_scantable[64];
 
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index fdc57d3876..3427dbe427 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -112,7 +112,7 @@ static void dct_unquantize_h263_intra_neon(const MPVContext 
*s, int16_t *block,
     if (s->ac_pred) {
         nCoeffs = 63;
     } else {
-        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];
+        nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];
         if (nCoeffs <= 0)
             return;
     }
-- 
2.49.1


>From 04542ba1eb6334bc7bb13bd8d85e118a7617aab0 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Fri, 28 Nov 2025 22:25:39 +0100
Subject: [PATCH 07/16] tests/checkasm: Add mpegvideo unquantize test

This adds a test for the mpegvideo unquantize functions.

It has been written in order to be able to easily bench
these functions. It should be noted that the random input
fed to the tested functions is not necessarily representative
of the stuff actually occurring in the wild. So benchmarks should
be taken with a grain of salt; but comparisons between two functions
that do not depend on branch predictions are valid (the usecase
for this is to port the x86 mmx functions to use xmm registers).

During testing I have found a bug in the arm/aarch64 neon optimizations
when using the LIBMPEG2 permutation (used by FF_IDCT_INT): The code
seems to be based on the presumption that the remainder of the number
of coefficients to process is always <= 4 mod 16. The test therefore
sometimes fails for these arches.

Hint: I am not certain that 16 bits are enough for the intermediate
values of all the computations involved; e.g. both FLV and MPEG-4
escape values can go beyond that after the corresponding
multiplications. The input in this test is nevertheless designed
to fit into 16 bits.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 tests/checkasm/Makefile               |   1 +
 tests/checkasm/checkasm.c             |   3 +
 tests/checkasm/checkasm.h             |   1 +
 tests/checkasm/mpegvideo_unquantize.c | 273 ++++++++++++++++++++++++++
 tests/fate/checkasm.mak               |   1 +
 5 files changed, 279 insertions(+)
 create mode 100644 tests/checkasm/mpegvideo_unquantize.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 3762c0d83b..b9c8adb21f 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -19,6 +19,7 @@ AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
 AVCODECOBJS-$(CONFIG_LPC)               += lpc.o
 AVCODECOBJS-$(CONFIG_ME_CMP)            += motion.o
+AVCODECOBJS-$(CONFIG_MPEGVIDEO)         += mpegvideo_unquantize.o
 AVCODECOBJS-$(CONFIG_MPEGVIDEOENCDSP)   += mpegvideoencdsp.o
 AVCODECOBJS-$(CONFIG_QPELDSP)           += qpeldsp.o
 AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 8c64684fa3..a899967937 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -215,6 +215,9 @@ static const struct {
     #if CONFIG_ME_CMP
         { "motion", checkasm_check_motion },
     #endif
+    #if CONFIG_MPEGVIDEO
+        { "mpegvideo_unquantize", checkasm_check_mpegvideo_unquantize },
+    #endif
     #if CONFIG_MPEGVIDEOENCDSP
         { "mpegvideoencdsp", checkasm_check_mpegvideoencdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index bd33aba263..72474da2a8 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -123,6 +123,7 @@ void checkasm_check_llviddsp(void);
 void checkasm_check_llviddspenc(void);
 void checkasm_check_lpc(void);
 void checkasm_check_motion(void);
+void checkasm_check_mpegvideo_unquantize(void);
 void checkasm_check_mpegvideoencdsp(void);
 void checkasm_check_nlmeans(void);
 void checkasm_check_opusdsp(void);
diff --git a/tests/checkasm/mpegvideo_unquantize.c 
b/tests/checkasm/mpegvideo_unquantize.c
new file mode 100644
index 0000000000..837606e60e
--- /dev/null
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -0,0 +1,273 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "config.h"
+
+#include "checkasm.h"
+
+#include "libavcodec/idctdsp.h"
+#include "libavcodec/mathops.h"
+#include "libavcodec/mpegvideo.h"
+#include "libavcodec/mpegvideodata.h"
+#include "libavcodec/mpegvideo_unquantize.h"
+
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#define randomize_struct(TYPE, s) do {                                 \
+    /* Fill *s with random bytes: whole 32-bit words, then the tail. */ \
+    static_assert(!(_Alignof(TYPE) % 4), "can't use aligned stores");  \
+    unsigned char *ptr = (unsigned char*)(s);                          \
+    for (size_t i = 0; i < (sizeof(*(s)) & ~(size_t)3); i += 4)        \
+        AV_WN32A(ptr + i, rnd());                                      \
+    for (size_t i = sizeof(*(s)) & ~(size_t)3; i < sizeof(*(s)); ++i)  \
+        ptr[i] = rnd();                                                \
+   } while (0)
+
+enum TestType {
+    H263,
+    MPEG1,
+    MPEG2,
+};
+
+static void init_idct_scantable(MPVContext *const s, int intra_scantable)
+{
+    static const enum idct_permutation_type permutation_types[] = {
+        FF_IDCT_PERM_NONE,
+        FF_IDCT_PERM_LIBMPEG2,
+#if ARCH_X86_32 && HAVE_X86ASM
+        FF_IDCT_PERM_SIMPLE,
+#endif
+#if ARCH_PPC || ARCH_X86
+        FF_IDCT_PERM_TRANSPOSE,
+#endif
+#if ARCH_ARM || ARCH_AARCH64
+        FF_IDCT_PERM_PARTTRANS,
+#endif
+#if ARCH_X86 && HAVE_X86ASM
+        FF_IDCT_PERM_SSE2,
+#endif
+    };
+    // Copied here to avoid #ifs.
+    static const uint8_t ff_wmv1_scantable[][64] = {
+    { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+      0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+      0x30, 0x38, 0x29, 0x21, 0x1A, 0x13, 0x0C, 0x05,
+      0x06, 0x0D, 0x14, 0x1B, 0x22, 0x31, 0x39, 0x3A,
+      0x32, 0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F,
+      0x16, 0x1D, 0x24, 0x2B, 0x33, 0x3B, 0x3C, 0x34,
+      0x2C, 0x25, 0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x35,
+      0x3D, 0x3E, 0x36, 0x2E, 0x27, 0x2F, 0x37, 0x3F, },
+    { 0x00, 0x08, 0x01, 0x02, 0x09, 0x10, 0x18, 0x11,
+      0x0A, 0x03, 0x04, 0x0B, 0x12, 0x19, 0x20, 0x28,
+      0x21, 0x30, 0x1A, 0x13, 0x0C, 0x05, 0x06, 0x0D,
+      0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39, 0x2A,
+      0x23, 0x1C, 0x15, 0x0E, 0x07, 0x0F, 0x16, 0x1D,
+      0x24, 0x2B, 0x32, 0x3A, 0x33, 0x3B, 0x2C, 0x25,
+      0x1E, 0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3C, 0x35,
+      0x3D, 0x2E, 0x27, 0x2F, 0x36, 0x3E, 0x37, 0x3F, },
+    { 0x00, 0x01, 0x08, 0x02, 0x03, 0x09, 0x10, 0x18,
+      0x11, 0x0A, 0x04, 0x05, 0x0B, 0x12, 0x19, 0x20,
+      0x28, 0x30, 0x21, 0x1A, 0x13, 0x0C, 0x06, 0x07,
+      0x0D, 0x14, 0x1B, 0x22, 0x29, 0x38, 0x31, 0x39,
+      0x2A, 0x23, 0x1C, 0x15, 0x0E, 0x0F, 0x16, 0x1D,
+      0x24, 0x2B, 0x32, 0x3A, 0x33, 0x2C, 0x25, 0x1E,
+      0x17, 0x1F, 0x26, 0x2D, 0x34, 0x3B, 0x3C, 0x35,
+      0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, },
+    { 0x00, 0x08, 0x10, 0x01, 0x18, 0x20, 0x28, 0x09,
+      0x02, 0x03, 0x0A, 0x11, 0x19, 0x30, 0x38, 0x29,
+      0x21, 0x1A, 0x12, 0x0B, 0x04, 0x05, 0x0C, 0x13,
+      0x1B, 0x22, 0x31, 0x39, 0x32, 0x2A, 0x23, 0x1C,
+      0x14, 0x0D, 0x06, 0x07, 0x0E, 0x15, 0x1D, 0x24,
+      0x2B, 0x33, 0x3A, 0x3B, 0x34, 0x2C, 0x25, 0x1E,
+      0x16, 0x0F, 0x17, 0x1F, 0x26, 0x2D, 0x3C, 0x35,
+      0x2E, 0x27, 0x2F, 0x36, 0x3D, 0x3E, 0x37, 0x3F, }
+    };
+
+    static const uint8_t *const scantables[] = {
+        ff_alternate_vertical_scan,
+        ff_alternate_horizontal_scan,
+        ff_zigzag_direct,
+        ff_wmv1_scantable[0],
+        ff_wmv1_scantable[1],
+        ff_wmv1_scantable[2],
+        ff_wmv1_scantable[3],
+    };
+    static const uint8_t *scantable = NULL;
+    static enum idct_permutation_type idct_permutation;
+
+    if (!scantable) {
+        scantable        = scantables[rnd() % FF_ARRAY_ELEMS(scantables)];
+        idct_permutation = permutation_types[rnd() % 
FF_ARRAY_ELEMS(permutation_types)];
+    }
+    ff_init_scantable_permutation(s->idsp.idct_permutation, idct_permutation);
+    ff_init_scantable(s->idsp.idct_permutation,
+                      intra_scantable ? &s->intra_scantable : 
&s->inter_scantable,
+                      scantable);
+}
+
+static void init_h263_test(MPVContext *const s, int16_t block[64],
+                           int last_nonzero_coeff, int qscale, int intra)
+{
+    const uint8_t *permutation = s->inter_scantable.permutated;
+    if (intra) {
+        permutation = s->intra_scantable.permutated;
+        block[0]    = rnd() & 511;
+        static int h263_aic = -1, ac_pred;
+        if (h263_aic < 0) {
+            h263_aic = rnd() & 1;
+            ac_pred  = rnd() & 1;
+        }
+        s->h263_aic = h263_aic;
+        s->ac_pred  = ac_pred;
+        if (s->ac_pred)
+            last_nonzero_coeff = 63;
+    }
+    for (int i = intra; i <= last_nonzero_coeff; ++i) {
+        int random = rnd();
+        if (random & 1)
+            continue;
+        random >>= 1;
+        // Select level so that the multiplication fits into 16 bits.
+        // FIXME: The FLV and MPEG-4 decoders can have escape values exceeding 
this.
+        block[permutation[i]] = sign_extend(random, 10);
+    }
+}
+
+static void init_mpeg12_test(MPVContext *const s, int16_t block[64],
+                             int last_nonzero_coeff, int qscale, int intra,
+                             enum TestType type)
+{
+    uint16_t *matrix = intra ? s->intra_matrix : s->inter_matrix;
+
+    if (type == MPEG2)
+        qscale = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : qscale 
<< 1;
+
+    for (int i = 0; i < 64; ++i)
+        matrix[i] = 1 + rnd() % 254;
+
+    const uint8_t *permutation = s->intra_scantable.permutated;
+    if (intra) {
+        block[0] = (int8_t)rnd();
+        for (int i = 1; i <= last_nonzero_coeff; ++i) {
+            int j = permutation[i];
+            unsigned random = rnd();
+            if (random & 1)
+                continue;
+            random >>= 1;
+            // Select level so that the multiplication does not overflow
+            // an int16_t and so that it is within the possible range
+            // (-2048..2047). FIXME: It seems that this need not be fulfilled
+            // in practice for the MPEG-4 decoder at least.
+            int limit = FFMIN(INT16_MAX / (qscale * matrix[j]), 2047);
+            block[j] = random % (2 * limit + 1) - limit;
+        }
+    } else {
+        for (int i = 0; i <= last_nonzero_coeff; ++i) {
+            int j = permutation[i];
+            unsigned random = rnd();
+            if (random & 1)
+                continue;
+            random >>= 1;
+            int limit = FFMIN((INT16_MAX / (qscale * matrix[j]) - 1) / 2, 
2047);
+            block[j] = random % (2 * limit + 1) - limit;
+        }
+    }
+}
+
+void checkasm_check_mpegvideo_unquantize(void)
+{
+    static const struct {
+        const char *name;
+        size_t offset;
+        int intra, intra_scantable;
+        enum TestType type;
+    } tests[] = {
+#define TEST(NAME, INTRA, INTRA_SCANTABLE, TYPE)                         \
+    { .name = #NAME, .offset = offsetof(MPVUnquantDSPContext, NAME),     \
+      .intra = INTRA, .intra_scantable = INTRA_SCANTABLE, .type = TYPE }
+        TEST(dct_unquantize_mpeg1_intra, 1, 1, MPEG1),
+        TEST(dct_unquantize_mpeg1_inter, 0, 1, MPEG1),
+        TEST(dct_unquantize_mpeg2_intra, 1, 1, MPEG2),
+        TEST(dct_unquantize_mpeg2_inter, 0, 1, MPEG2),
+        TEST(dct_unquantize_h263_intra,  1, 1, H263),
+        TEST(dct_unquantize_h263_inter,  0, 0, H263),
+    };
+    MPVUnquantDSPContext unquant_dsp_ctx;
+    int q_scale_type = rnd() & 1;
+
+    ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
+    declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block, 
int n, int qscale);
+
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+        void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =
+            *(void (**)(MPVContext *, int16_t *, int, 
int))((char*)&unquant_dsp_ctx + tests[i].offset);
+        if (check_func(func, "%s", tests[i].name)) {
+            MPVContext new, ref;
+            DECLARE_ALIGNED(16, int16_t, block_new)[64];
+            DECLARE_ALIGNED(16, int16_t, block_ref)[64];
+            static int block_last_index = -1;
+
+            randomize_struct(MPVContext, &ref);
+
+            ref.q_scale_type = q_scale_type;
+
+            init_idct_scantable(&ref, tests[i].intra_scantable);
+
+            if (block_last_index < 0)
+                block_last_index = rnd() % 64;
+
+            memset(block_ref, 0, sizeof(block_ref));
+
+            if (tests[i].intra) {
+                // Less restricted than real dc_scale values
+                ref.y_dc_scale = 1 + rnd() % 64;
+                ref.c_dc_scale = 1 + rnd() % 64;
+            }
+
+            static int qscale = 0;
+
+            if (qscale == 0)
+                qscale = 1 + rnd() % 31;
+
+            if (tests[i].type == H263)
+                init_h263_test(&ref, block_ref, block_last_index, qscale,
+                               tests[i].intra);
+            else
+                init_mpeg12_test(&ref, block_ref, block_last_index, qscale,
+                                 tests[i].intra, tests[i].type);
+
+            int n = rnd() % 6;
+            ref.block_last_index[n] = block_last_index;
+
+            memcpy(&new, &ref, sizeof(new));
+            memcpy(block_new, block_ref, sizeof(block_new));
+
+            call_ref(&ref, block_ref, n, qscale);
+            call_new(&new, block_new, n, qscale);
+
+            if (memcmp(&ref, &new, sizeof(new)) || memcmp(block_new, 
block_ref, sizeof(block_new)))
+                fail();
+
+            bench_new(&new, block_new, n, qscale);
+        }
+    }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index f182efde46..48edd17bf2 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -39,6 +39,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                       
          \
                 fate-checkasm-llviddspenc                               \
                 fate-checkasm-lpc                                       \
                 fate-checkasm-motion                                    \
+                fate-checkasm-mpegvideo_unquantize                      \
                 fate-checkasm-mpegvideoencdsp                           \
                 fate-checkasm-opusdsp                                   \
                 fate-checkasm-pixblockdsp                               \
-- 
2.49.1


>From 8ba93acc339c209d522b70cfed0312f3ab2f2238 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 29 Nov 2025 01:05:51 +0100
Subject: [PATCH 08/16] avcodec/{arm,neon}/mpegvideo: Fix h263 unquantize
 functions

These functions currently operate on the assumption that the number
of coefficients to process is always of the form 16k+m with m<=4 or >8.
Yet this is not true when the IDCT permutation is of type FF_IDCT_PERM_LIBMPEG2
(i.e. when FF_IDCT_INT is in use).

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/arm/mpegvideo_neon.S | 18 +++++++++---------
 libavcodec/neon/mpegvideo.c     | 20 ++++++++------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/libavcodec/arm/mpegvideo_neon.S b/libavcodec/arm/mpegvideo_neon.S
index c7a35ea267..7e42bdf6c5 100644
--- a/libavcodec/arm/mpegvideo_neon.S
+++ b/libavcodec/arm/mpegvideo_neon.S
@@ -36,7 +36,7 @@ function ff_dct_unquantize_h263_neon, export=1
         vdup.16         q15, r0                 @ qmul
         vdup.16         q14, r2                 @ qadd
         vneg.s16        q13, q14
-        cmp             r3,  #4
+        cmp             r3,  #8
         mov             r0,  r1
         ble             2f
 1:
@@ -62,14 +62,14 @@ function ff_dct_unquantize_h263_neon, export=1
         cmp             r3,  #8
         bgt             1b
 2:
-        vld1.16         {d0},     [r0,:64]
-        vclt.s16        d3,  d0,  #0
-        vceq.s16        d1,  d0,  #0
-        vmul.s16        d2,  d0,  d30
-        vbsl            d3,  d26, d28
-        vadd.s16        d2,  d2,  d3
-        vbif            d0,  d2,  d1
-        vst1.16         {d0},     [r1,:64]
+        vld1.16         {q0},     [r0,:128]
+        vclt.s16        q3,  q0,  #0
+        vceq.s16        q1,  q0,  #0
+        vmul.s16        q2,  q0,  q15
+        vbsl            q3,  q13, q14
+        vadd.s16        q2,  q2,  q3
+        vbif            q0,  q2,  q1
+        vst1.16         {q0},     [r1,:128]
         bx              lr
 endfunc
 
diff --git a/libavcodec/neon/mpegvideo.c b/libavcodec/neon/mpegvideo.c
index 3427dbe427..44e9b70303 100644
--- a/libavcodec/neon/mpegvideo.c
+++ b/libavcodec/neon/mpegvideo.c
@@ -39,12 +39,9 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, 
int qadd, int nCoeffs
 {
     int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
     int16x8_t q14s16, q15s16, qzs16;
-    int16x4_t d0s16, d2s16, d3s16, dzs16;
     uint16x8_t q1u16, q9u16;
-    uint16x4_t d1u16;
 
-    dzs16 = vdup_n_s16(0);
     qzs16 = vdupq_n_s16(0);
 
     q15s16 = vdupq_n_s16(qscale << 1);
     q14s16 = vdupq_n_s16(qadd);
@@ -73,15 +70,14 @@ static void inline ff_dct_unquantize_h263_neon(int qscale, 
int qadd, int nCoeffs
     if (nCoeffs <= 0)
         return;
 
-    d0s16 = vld1_s16(block);
-    d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
-    d1u16 = vceq_s16(d0s16, dzs16);
-    d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
-    d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
-                     vget_high_s16(q13s16), vget_high_s16(q14s16));
-    d2s16 = vadd_s16(d2s16, d3s16);
-    d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
-    vst1_s16(block, d0s16);
+    q0s16 = vld1q_s16(block);
+    q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
+    q1u16 = vceqq_s16(q0s16, qzs16);
+    q2s16 = vmulq_s16(q0s16, q15s16);
+    q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
+    q2s16 = vaddq_s16(q2s16, q3s16);
+    q0s16 = vbslq_s16(q1u16, q0s16, q2s16);
+    vst1q_s16(block, q0s16);
 }
 
 static void dct_unquantize_h263_inter_neon(const MPVContext *s, int16_t *block,
-- 
2.49.1


>From e0f15f04798e68fb2aad6b60872127cfd3707e01 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 29 Nov 2025 01:17:08 +0100
Subject: [PATCH 09/16] avcodec/mpegvideo: Move ff_init_scantable() to
 mpegvideo_unquantize.c

This is necessary so that the mpegvideo_unquantize checkasm test
does not pull mpegvideo.o and then all of libavcodec into checkasm.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/mpegvideo.c            | 15 ---------------
 libavcodec/mpegvideo_unquantize.c | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/libavcodec/mpegvideo.c b/libavcodec/mpegvideo.c
index a137fe31db..7ca2c8f701 100644
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -42,7 +42,6 @@
 #include "mpegutils.h"
 #include "mpegvideo.h"
 #include "mpegvideodata.h"
-#include "mpegvideo_unquantize.h"
 #include "libavutil/refstruct.h"
 
 
@@ -79,20 +78,6 @@ static av_cold void dsp_init(MpegEncContext *s)
     }
 }
 
-av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
-                               const uint8_t *src_scantable)
-{
-    st->scantable = src_scantable;
-
-    for (int i = 0, end = -1; i < 64; i++) {
-        int j = src_scantable[i];
-        st->permutated[i] = permutation[j];
-        if (permutation[j] > end)
-            end = permutation[j];
-        st->raster_end[i] = end;
-    }
-}
-
 av_cold void ff_mpv_idct_init(MpegEncContext *s)
 {
     if (s->codec_id == AV_CODEC_ID_MPEG4)
diff --git a/libavcodec/mpegvideo_unquantize.c 
b/libavcodec/mpegvideo_unquantize.c
index 06c29d0753..9297c80b47 100644
--- a/libavcodec/mpegvideo_unquantize.c
+++ b/libavcodec/mpegvideo_unquantize.c
@@ -33,6 +33,20 @@
 #include "mpegvideodata.h"
 #include "mpegvideo_unquantize.h"
 
+av_cold void ff_init_scantable(const uint8_t *permutation, ScanTable *st,
+                               const uint8_t *src_scantable)
+{
+    st->scantable = src_scantable;
+
+    for (int i = 0, end = -1; i < 64; i++) {
+        int j = src_scantable[i];
+        st->permutated[i] = permutation[j];
+        if (permutation[j] > end)
+            end = permutation[j];
+        st->raster_end[i] = end;
+    }
+}
+
 static void dct_unquantize_mpeg1_intra_c(const MPVContext *s,
                                          int16_t *block, int n, int qscale)
 {
-- 
2.49.1


>From b83bc44c8bbdc9a82fef2fa65df1fd8b65f66e78 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Sat, 29 Nov 2025 22:23:50 +0100
Subject: [PATCH 10/16] avcodec/x86/mpegvideo: Use correct inline assembly
 constraints

The H.263 unquantize functions modified an input parameter.
(And they did so since this code was added in
7f3f5ec87bcbf244fce49ffdb476d4ae6e523af6. I am surprised
that this didn't cause issues, particularly with the intra function.)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c | 64 +++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 4c3299362e..38dcd8fc6e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -33,9 +33,8 @@
 static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
                                           int16_t *block, int n, int qscale)
 {
-    x86_reg level, qmul, qadd, nCoeffs;
-
-    qmul = qscale << 1;
+    x86_reg qmul = (unsigned)qscale << 1;
+    int level, qadd;
 
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
@@ -49,16 +48,15 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext 
*s,
         qadd = 0;
         level= block[0];
     }
-    if(s->ac_pred)
-        nCoeffs=63;
-    else
-        nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
+    x86_reg offset = s->ac_pred ? 63 << 1 : 
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
 
 __asm__ volatile(
-                "movd %1, %%mm6                 \n\t" //qmul
+                "movd          %k1, %%mm6       \n\t" //qmul
+                "lea      (%2, %0), %1          \n\t"
+                "neg            %0              \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "movd %2, %%mm5                 \n\t" //qadd
+                "movd           %3, %%mm5       \n\t" //qadd
                 "pxor %%mm7, %%mm7              \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
@@ -66,14 +64,14 @@ __asm__ volatile(
                 "pxor %%mm4, %%mm4              \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %3), %%mm0           \n\t"
-                "movq 8(%0, %3), %%mm1          \n\t"
+                "movq     (%1, %0), %%mm0       \n\t"
+                "movq    8(%1, %0), %%mm1       \n\t"
 
                 "pmullw %%mm6, %%mm0            \n\t"
                 "pmullw %%mm6, %%mm1            \n\t"
 
-                "movq (%0, %3), %%mm2           \n\t"
-                "movq 8(%0, %3), %%mm3          \n\t"
+                "movq     (%1, %0), %%mm2       \n\t"
+                "movq    8(%1, %0), %%mm3       \n\t"
 
                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
@@ -93,12 +91,13 @@ __asm__ volatile(
                 "pandn %%mm2, %%mm0             \n\t"
                 "pandn %%mm3, %%mm1             \n\t"
 
-                "movq %%mm0, (%0, %3)           \n\t"
-                "movq %%mm1, 8(%0, %3)          \n\t"
+                "movq        %%mm0, (%1, %0)    \n\t"
+                "movq        %%mm1, 8(%1, %0)   \n\t"
 
-                "add $16, %3                    \n\t"
+                "add           $16, %0          \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" 
(2*(-nCoeffs))
+                : "+r"(offset), "+r"(qmul)
+                : "r" (block), "rm" (qadd)
                 : "memory"
         );
         block[0]= level;
@@ -108,20 +107,20 @@ __asm__ volatile(
 static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
                                           int16_t *block, int n, int qscale)
 {
-    x86_reg qmul, qadd, nCoeffs;
-
-    qmul = qscale << 1;
-    qadd = (qscale - 1) | 1;
+    int qmul = qscale << 1;
+    int qadd = (qscale - 1) | 1;
 
     av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
-    nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
+    x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 
1;
 
 __asm__ volatile(
-                "movd %1, %%mm6                 \n\t" //qmul
+                "movd           %2, %%mm6       \n\t" //qmul
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "movd %2, %%mm5                 \n\t" //qadd
+                "movd           %3, %%mm5       \n\t" //qadd
+                "add            %1, %0          \n\t"
+                "neg            %1              \n\t"
                 "pxor %%mm7, %%mm7              \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
                 "packssdw %%mm5, %%mm5          \n\t"
@@ -129,14 +128,14 @@ __asm__ volatile(
                 "pxor %%mm4, %%mm4              \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %3), %%mm0           \n\t"
-                "movq 8(%0, %3), %%mm1          \n\t"
+                "movq     (%0, %1), %%mm0       \n\t"
+                "movq    8(%0, %1), %%mm1       \n\t"
 
                 "pmullw %%mm6, %%mm0            \n\t"
                 "pmullw %%mm6, %%mm1            \n\t"
 
-                "movq (%0, %3), %%mm2           \n\t"
-                "movq 8(%0, %3), %%mm3          \n\t"
+                "movq     (%0, %1), %%mm2       \n\t"
+                "movq    8(%0, %1), %%mm3       \n\t"
 
                 "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
                 "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
@@ -156,12 +155,13 @@ __asm__ volatile(
                 "pandn %%mm2, %%mm0             \n\t"
                 "pandn %%mm3, %%mm1             \n\t"
 
-                "movq %%mm0, (%0, %3)           \n\t"
-                "movq %%mm1, 8(%0, %3)          \n\t"
+                "movq        %%mm0, (%0, %1)    \n\t"
+                "movq        %%mm1, 8(%0, %1)   \n\t"
 
-                "add $16, %3                    \n\t"
+                "add           $16, %1          \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" 
(2*(-nCoeffs))
+                : "+r" (block), "+r" (offset)
+                : "rm"(qmul), "rm" (qadd)
                 : "memory"
         );
 }
-- 
2.49.1


>From e084623582aac2b4356014198bf12f6ca0f22c6e Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 7 Oct 2025 10:35:08 +0200
Subject: [PATCH 11/16] avcodec/x86/mpegvideo: Improve unquantizing MPEG-2
 intra blocks

Unquantizing involves calculating
    (block[j] * qscale * quant_matrix[j]) / 16
where / rounds towards zero. Arithmetic right shifts
naturally round towards -inf, so the earlier code
calculated the absolute value first, then used a right-shift
and then negated the result if necessary.

This commit uses a different procedure: It biases the product
for negative values of block[j] by 0xf. The combination of
this and the arithmetic right shift is the same as rounding
towards zero.

Furthermore, a write-only store to mm7 has been removed.

Benchmarks:
dct_unquantize_mpeg2_intra_c:                          214.3 ( 1.00x)
dct_unquantize_mpeg2_intra_mmx (old):                   43.0 ( 4.98x)
dct_unquantize_mpeg2_intra_mmx (new):                   28.4 ( 7.56x)

(The bitexact flag and the test for correctness have been removed
from checkasm for the benchmarks.)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c | 38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 38dcd8fc6e..d1614eb1eb 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -321,8 +321,6 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext 
*s,
         block0 = block[0] * s->c_dc_scale;
     quant_matrix = s->intra_matrix;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
                 "movd %2, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
@@ -335,30 +333,18 @@ __asm__ volatile(
                 "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
-                "psraw $4, %%mm0                \n\t"
-                "psraw $4, %%mm1                \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm0, %%mm2              \n\t"
+                "movq %%mm1, %%mm3              \n\t"
+                "psrlw $12, %%mm2               \n\t" // block[i] < 0 ? 0xf : 0
+                "psrlw $12, %%mm3               \n\t" // (block[i] is in the 
-2048..2047 range)
+                "pmullw %%mm4, %%mm0            \n\t" // block[i]*q
+                "pmullw %%mm5, %%mm1            \n\t" // block[i]*q
+                "paddw %%mm2, %%mm0             \n\t" // bias negative block[i]
+                "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
+                "psraw $4, %%mm0                \n\t" // is equivalent to 
divide
+                "psraw $4, %%mm1                \n\t" // with rounding towards 
zero
+                "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
+                "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
 
                 "add $16, %%"FF_REG_a"          \n\t"
                 "jng 1b                         \n\t"
-- 
2.49.1


>From 46cb80b82d831835ef116a8327569f0151e16122 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 3 Nov 2025 19:17:16 +0100
Subject: [PATCH 12/16] avcodec/x86/mpegvideo: Don't duplicate register

Currently several inline ASM blocks use a value as
an input and rax as a clobber register. The input value
is just moved into the register, which then serves as the loop
counter. This is wasteful, as one can just use the value's
register directly as the loop counter.

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c | 119 +++++++++++++++++++------------------
 1 file changed, 60 insertions(+), 59 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index d1614eb1eb..aa15e2b32a 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -183,19 +183,19 @@ static void dct_unquantize_mpeg1_intra_mmx(const 
MPVContext *s,
         block0 = block[0] * s->c_dc_scale;
     /* XXX: only MPEG-1 */
     quant_matrix = s->intra_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlw $15, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -210,8 +210,8 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
                 "psraw $3, %%mm0                \n\t"
                 "psraw $3, %%mm1                \n\t"
                 "psubw %%mm7, %%mm0             \n\t"
@@ -224,13 +224,14 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%1, %0)           \n\t"
+                "movq %%mm5, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "js 1b                          \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
     block[0]= block0;
 }
@@ -246,19 +247,19 @@ static void dct_unquantize_mpeg1_inter_mmx(const 
MPVContext *s,
     nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
         quant_matrix = s->inter_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlw $15, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -277,8 +278,8 @@ __asm__ volatile(
                 "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 
1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
                 "psraw $4, %%mm0                \n\t"
                 "psraw $4, %%mm1                \n\t"
                 "psubw %%mm7, %%mm0             \n\t"
@@ -291,13 +292,14 @@ __asm__ volatile(
                 "psubw %%mm3, %%mm1             \n\t"
                 "pandn %%mm0, %%mm4             \n\t"
                 "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm4, (%1, %0)           \n\t"
+                "movq %%mm5, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "js 1b                          \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
 }
 
@@ -320,17 +322,17 @@ static void dct_unquantize_mpeg2_intra_mmx(const 
MPVContext *s,
     else
         block0 = block[0] * s->c_dc_scale;
     quant_matrix = s->intra_matrix;
+    x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "movd %2, %%mm6                 \n\t"
+                "movd %3, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq (%1, %0), %%mm0           \n\t"
+                "movq 8(%1, %0), %%mm1          \n\t"
+                "movq (%2, %0), %%mm4           \n\t"
+                "movq 8(%2, %0), %%mm5          \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "movq %%mm0, %%mm2              \n\t"
@@ -343,13 +345,14 @@ __asm__ volatile(
                 "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
                 "psraw $4, %%mm0                \n\t" // is equivalent to 
divide
                 "psraw $4, %%mm1                \n\t" // with rounding towards 
zero
-                "movq %%mm0, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm1, 8(%0, %%"FF_REG_a")\n\t"
+                "movq %%mm0, (%1, %0)           \n\t"
+                "movq %%mm1, 8(%1, %0)          \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add $16, %0                    \n\t"
                 "jng 1b                         \n\t"
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "g" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r" (offset)
+                : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
+                : "memory"
         );
     block[0]= block0;
         //Note, we do not do mismatch control for intra as errors cannot 
accumulate
@@ -358,30 +361,27 @@ __asm__ volatile(
 static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
 {
-    x86_reg nCoeffs;
-    const uint16_t *quant_matrix;
-
     av_assert2(s->block_last_index[n]>=0);
 
-    if (s->q_scale_type) qscale = ff_mpeg2_non_linear_qscale[qscale];
-    else                 qscale <<= 1;
+    x86_reg qscale2 = s->q_scale_type ? ff_mpeg2_non_linear_qscale[qscale] : 
(unsigned)qscale << 1;
+    x86_reg offset  = s->intra_scantable.raster_end[s->block_last_index[n]] << 
1;
+    const void *quant_matrix = (const char*)s->inter_matrix + offset;
 
-    nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
-        quant_matrix = s->inter_matrix;
 __asm__ volatile(
+                "movd          %k1, %%mm6      \n\t"
+                "lea      (%2, %0), %1         \n\t"
+                "neg            %0             \n\t"
                 "pcmpeqw %%mm7, %%mm7           \n\t"
                 "psrlq $48, %%mm7               \n\t"
-                "movd %2, %%mm6                 \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
                 "packssdw %%mm6, %%mm6          \n\t"
-                "mov %3, %%"FF_REG_a"           \n\t"
                 ".p2align 4                     \n\t"
                 "1:                             \n\t"
-                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
-                "movq 8(%0, %%"FF_REG_a"), %%mm1\n\t"
-                "movq (%1, %%"FF_REG_a"), %%mm4 \n\t"
-                "movq 8(%1, %%"FF_REG_a"), %%mm5\n\t"
+                "movq     (%1, %0), %%mm0      \n\t"
+                "movq    8(%1, %0), %%mm1      \n\t"
+                "movq     (%3, %0), %%mm4      \n\t"
+                "movq    8(%3, %0), %%mm5      \n\t"
                 "pmullw %%mm6, %%mm4            \n\t" // 
q=qscale*quant_matrix[i]
                 "pmullw %%mm6, %%mm5            \n\t" // 
q=qscale*quant_matrix[i]
                 "pxor %%mm2, %%mm2              \n\t"
@@ -400,8 +400,8 @@ __asm__ volatile(
                 "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 
1)*q
                 "pxor %%mm4, %%mm4              \n\t"
                 "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 
: 0
-                "pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 
: 0
+                "pcmpeqw  (%1, %0), %%mm4      \n\t" // block[i] == 0 ? -1 : 0
+                "pcmpeqw 8(%1, %0), %%mm5      \n\t" // block[i] == 0 ? -1 : 0
                 "psrlw $5, %%mm0                \n\t"
                 "psrlw $5, %%mm1                \n\t"
                 "pxor %%mm2, %%mm0              \n\t"
@@ -412,12 +412,12 @@ __asm__ volatile(
                 "pandn %%mm1, %%mm5             \n\t"
                 "pxor %%mm4, %%mm7              \n\t"
                 "pxor %%mm5, %%mm7              \n\t"
-                "movq %%mm4, (%0, %%"FF_REG_a") \n\t"
-                "movq %%mm5, 8(%0, %%"FF_REG_a")\n\t"
+                "movq        %%mm4, (%1, %0)   \n\t"
+                "movq        %%mm5, 8(%1, %0)  \n\t"
 
-                "add $16, %%"FF_REG_a"          \n\t"
+                "add           $16, %0          \n\t"
                 "jng 1b                         \n\t"
-                "movd 124(%0, %3), %%mm0        \n\t"
+                "movd      124(%2), %%mm0      \n\t"
                 "movq %%mm7, %%mm6              \n\t"
                 "psrlq $32, %%mm7               \n\t"
                 "pxor %%mm6, %%mm7              \n\t"
@@ -427,10 +427,11 @@ __asm__ volatile(
                 "pslld $31, %%mm7               \n\t"
                 "psrlq $15, %%mm7               \n\t"
                 "pxor %%mm7, %%mm0              \n\t"
-                "movd %%mm0, 124(%0, %3)        \n\t"
+                "movd        %%mm0, 124(%2)    \n\t"
 
-                ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" 
(qscale), "r" (-2*nCoeffs)
-                : "%"FF_REG_a, "memory"
+                : "+r"(offset), "+r" (qscale2)
+                : "r" (block), "r"(quant_matrix)
+                : "memory"
         );
 }
 
-- 
2.49.1


>From 9d92daa446ae66d46634f53642a623120dd8a98f Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 4 Nov 2025 07:53:09 +0100
Subject: [PATCH 13/16] avcodec/x86/mpegvideo: Port
 dct_unquantize_h263_{intra,inter}_mmx to SSSE3

It benefits from wider registers and psignw.

Benchmarks:
dct_unquantize_h263_inter_c:                            88.3 ( 1.00x)
dct_unquantize_h263_inter_mmx:                          24.7 ( 3.58x)
dct_unquantize_h263_inter_ssse3:                         9.3 ( 9.47x)
dct_unquantize_h263_intra_c:                            93.7 ( 1.00x)
dct_unquantize_h263_intra_mmx:                          30.6 ( 3.06x)
dct_unquantize_h263_intra_ssse3:                        16.5 ( 5.69x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c | 146 ++++++++++++++++---------------------
 1 file changed, 62 insertions(+), 84 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index aa15e2b32a..82a29d1bcf 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -30,8 +30,13 @@
 
 #if HAVE_MMX_INLINE
 
-static void dct_unquantize_h263_intra_mmx(const MPVContext *s,
-                                          int16_t *block, int n, int qscale)
+#define SPLATW(reg) "punpcklwd    %%" #reg ", %%" #reg "\n\t" \
+                    "pshufd   $0, %%" #reg ", %%" #reg "\n\t"
+
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_h263_intra_ssse3(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     x86_reg qmul = (unsigned)qscale << 1;
     int level, qadd;
@@ -51,61 +56,45 @@ static void dct_unquantize_h263_intra_mmx(const MPVContext 
*s,
     x86_reg offset = s->ac_pred ? 63 << 1 : 
s->intra_scantable.raster_end[s->block_last_index[n]] << 1;
 
 __asm__ volatile(
-                "movd          %k1, %%mm6       \n\t" //qmul
-                "lea      (%2, %0), %1          \n\t"
-                "neg            %0              \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "movd           %3, %%mm5       \n\t" //qadd
-                "pxor %%mm7, %%mm7              \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "psubw %%mm5, %%mm7             \n\t"
-                "pxor %%mm4, %%mm4              \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%1, %0), %%mm0       \n\t"
-                "movq    8(%1, %0), %%mm1       \n\t"
+                "movd          %k1, %%xmm0     \n\t" //qmul
+                "lea      (%2, %0), %1         \n\t"
+                "neg            %0             \n\t"
+                "movd           %3, %%xmm1     \n\t" //qadd
+                SPLATW(xmm0)
+                SPLATW(xmm1)
 
-                "pmullw %%mm6, %%mm0            \n\t"
-                "pmullw %%mm6, %%mm1            \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%1, %0), %%xmm2     \n\t"
+                "movdqa 16(%1, %0), %%xmm3     \n\t"
 
-                "movq     (%1, %0), %%mm2       \n\t"
-                "movq    8(%1, %0), %%mm3       \n\t"
+                "movdqa     %%xmm1, %%xmm4     \n\t"
+                "movdqa     %%xmm1, %%xmm5     \n\t"
 
-                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
+                "psignw     %%xmm2, %%xmm4     \n\t" // sgn(block[i])*qadd
+                "psignw     %%xmm3, %%xmm5     \n\t" // sgn(block[i])*qadd
 
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
+                "pmullw     %%xmm0, %%xmm2     \n\t"
+                "pmullw     %%xmm0, %%xmm3     \n\t"
 
-                "paddw %%mm7, %%mm0             \n\t"
-                "paddw %%mm7, %%mm1             \n\t"
+                "paddw      %%xmm4, %%xmm2     \n\t"
+                "paddw      %%xmm5, %%xmm3     \n\t"
 
-                "pxor %%mm0, %%mm2              \n\t"
-                "pxor %%mm1, %%mm3              \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
-
-                "pandn %%mm2, %%mm0             \n\t"
-                "pandn %%mm3, %%mm1             \n\t"
-
-                "movq        %%mm0, (%1, %0)    \n\t"
-                "movq        %%mm1, 8(%1, %0)   \n\t"
-
-                "add           $16, %0          \n\t"
-                "jng 1b                         \n\t"
+                "add           $32, %0         \n\t"
+                "jng            1b             \n\t"
                 : "+r"(offset), "+r"(qmul)
                 : "r" (block), "rm" (qadd)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", 
"%xmm5",) "memory"
         );
         block[0]= level;
 }
 
 
-static void dct_unquantize_h263_inter_mmx(const MPVContext *s,
-                                          int16_t *block, int n, int qscale)
+static void dct_unquantize_h263_inter_ssse3(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     int qmul = qscale << 1;
     int qadd = (qscale - 1) | 1;
@@ -115,56 +104,41 @@ static void dct_unquantize_h263_inter_mmx(const 
MPVContext *s,
     x86_reg offset = s->inter_scantable.raster_end[s->block_last_index[n]] << 
1;
 
 __asm__ volatile(
-                "movd           %2, %%mm6       \n\t" //qmul
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "movd           %3, %%mm5       \n\t" //qadd
-                "add            %1, %0          \n\t"
-                "neg            %1              \n\t"
-                "pxor %%mm7, %%mm7              \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "packssdw %%mm5, %%mm5          \n\t"
-                "psubw %%mm5, %%mm7             \n\t"
-                "pxor %%mm4, %%mm4              \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%0, %1), %%mm0       \n\t"
-                "movq    8(%0, %1), %%mm1       \n\t"
+                "movd           %2, %%xmm0     \n\t" //qmul
+                "movd           %3, %%xmm1     \n\t" //qadd
+                "add            %1, %0         \n\t"
+                "neg            %1             \n\t"
+                SPLATW(xmm0)
+                SPLATW(xmm1)
 
-                "pmullw %%mm6, %%mm0            \n\t"
-                "pmullw %%mm6, %%mm1            \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%0, %1), %%xmm2     \n\t"
+                "movdqa 16(%0, %1), %%xmm3     \n\t"
 
-                "movq     (%0, %1), %%mm2       \n\t"
-                "movq    8(%0, %1), %%mm3       \n\t"
+                "movdqa     %%xmm1, %%xmm4     \n\t"
+                "movdqa     %%xmm1, %%xmm5     \n\t"
 
-                "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
+                "psignw     %%xmm2, %%xmm4     \n\t" // sgn(block[i])*qadd
+                "psignw     %%xmm3, %%xmm5     \n\t" // sgn(block[i])*qadd
 
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
+                "pmullw     %%xmm0, %%xmm2     \n\t"
+                "pmullw     %%xmm0, %%xmm3     \n\t"
 
-                "paddw %%mm7, %%mm0             \n\t"
-                "paddw %%mm7, %%mm1             \n\t"
+                "paddw      %%xmm4, %%xmm2     \n\t"
+                "paddw      %%xmm5, %%xmm3     \n\t"
 
-                "pxor %%mm0, %%mm2              \n\t"
-                "pxor %%mm1, %%mm3              \n\t"
+                "movdqa     %%xmm2, (%0, %1)   \n\t"
+                "movdqa     %%xmm3, 16(%0, %1) \n\t"
 
-                "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0
-
-                "pandn %%mm2, %%mm0             \n\t"
-                "pandn %%mm3, %%mm1             \n\t"
-
-                "movq        %%mm0, (%0, %1)    \n\t"
-                "movq        %%mm1, 8(%0, %1)   \n\t"
-
-                "add           $16, %1          \n\t"
-                "jng 1b                         \n\t"
+                "add           $32, %1         \n\t"
+                "jng 1b                        \n\t"
                 : "+r" (block), "+r" (offset)
                 : "rm"(qmul), "rm" (qadd)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", 
"%xmm5",) "memory"
         );
 }
+#endif
 
 static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
@@ -443,13 +417,17 @@ av_cold void 
ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
-        s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
+#if HAVE_SSSE3_INLINE
+    if (INLINE_SSSE3(cpu_flags)) {
+        s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
+        s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
+    }
+#endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
 }
-- 
2.49.1


>From 34c34969187d900f041d5340ff1301643a1cad63 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Mon, 3 Nov 2025 19:45:49 +0100
Subject: [PATCH 14/16] avcodec/x86/mpegvideo: Port MPEG-1 unquantize functions
 to SSSE3

Benefits from wider registers and pabsw, psignw.

Benchmarks:
dct_unquantize_mpeg1_inter_c:                          343.0 ( 1.00x)
dct_unquantize_mpeg1_inter_mmx:                         50.6 ( 6.78x)
dct_unquantize_mpeg1_inter_ssse3:                       17.2 (19.94x)
dct_unquantize_mpeg1_intra_c:                          352.1 ( 1.00x)
dct_unquantize_mpeg1_intra_mmx:                         48.8 ( 7.22x)
dct_unquantize_mpeg1_intra_ssse3:                       19.5 (18.03x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/mpegvideo.h     |  10 ++-
 libavcodec/x86/mpegvideo.c | 171 ++++++++++++++++---------------------
 2 files changed, 78 insertions(+), 103 deletions(-)

diff --git a/libavcodec/mpegvideo.h b/libavcodec/mpegvideo.h
index 758bf57ab9..6aff5fbcd0 100644
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -38,6 +38,8 @@
 #include "qpeldsp.h"
 #include "videodsp.h"
 
+#include "libavutil/mem_internal.h"
+
 #define MAX_THREADS 32
 
 /**
@@ -202,10 +204,10 @@ typedef struct MpegEncContext {
     int *mb_index2xy;        ///< mb_index -> mb_x + mb_y*mb_stride
 
     /** matrix transmitted in the bitstream */
-    uint16_t intra_matrix[64];
-    uint16_t chroma_intra_matrix[64];
-    uint16_t inter_matrix[64];
-    uint16_t chroma_inter_matrix[64];
+    DECLARE_ALIGNED(16, uint16_t, intra_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, chroma_intra_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, inter_matrix)[64];
+    DECLARE_ALIGNED(16, uint16_t, chroma_inter_matrix)[64];
 
     /* error concealment / resync */
     int resync_mb_x;                 ///< x position of last resync marker
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 82a29d1bcf..01048df47d 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -138,10 +138,9 @@ __asm__ volatile(
                 : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", 
"%xmm5",) "memory"
         );
 }
-#endif
 
-static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_intra_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -159,59 +158,45 @@ static void dct_unquantize_mpeg1_intra_mmx(const MPVContext *s,
     quant_matrix = s->intra_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
-                "psraw $3, %%mm0                \n\t"
-                "psraw $3, %%mm1                \n\t"
-                "psubw %%mm7, %%mm0             \n\t"
-                "psubw %%mm7, %%mm1             \n\t"
-                "por %%mm7, %%mm0               \n\t"
-                "por %%mm7, %%mm1               \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%1, %0)           \n\t"
-                "movq %%mm5, 8(%1, %0)          \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrlw         $15, %%xmm7     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "pmullw     %%xmm4, %%xmm2     \n\t" // abs(block[i])*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // abs(block[i])*q
+                "psraw          $3, %%xmm2     \n\t"
+                "psraw          $3, %%xmm3     \n\t"
+                "psubw      %%xmm7, %%xmm2     \n\t"
+                "psubw      %%xmm7, %%xmm3     \n\t"
+                "por        %%xmm7, %%xmm2     \n\t"
+                "por        %%xmm7, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "add $16, %0                    \n\t"
-                "js 1b                          \n\t"
+                "add           $32, %0         \n\t"
+                "js 1b                         \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
     block[0]= block0;
 }
 
-static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg1_inter_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -223,60 +208,48 @@ static void dct_unquantize_mpeg1_inter_mmx(const MPVContext *s,
         quant_matrix = s->inter_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlw $15, %%mm7               \n\t"
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
-                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
-                "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
-                "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
-                "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
-                "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw (%1, %0), %%mm4        \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5       \n\t" // block[i] == 0 ? -1 : 0
-                "psraw $4, %%mm0                \n\t"
-                "psraw $4, %%mm1                \n\t"
-                "psubw %%mm7, %%mm0             \n\t"
-                "psubw %%mm7, %%mm1             \n\t"
-                "por %%mm7, %%mm0               \n\t"
-                "por %%mm7, %%mm1               \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "movq %%mm4, (%1, %0)           \n\t"
-                "movq %%mm5, 8(%1, %0)          \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrlw         $15, %%xmm7     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "paddw      %%xmm2, %%xmm2     \n\t" // abs(block[i])*2
+                "paddw      %%xmm3, %%xmm3     \n\t" // abs(block[i])*2
+                "paddw      %%xmm7, %%xmm2     \n\t" // abs(block[i])*2 + 1
+                "paddw      %%xmm7, %%xmm3     \n\t" // abs(block[i])*2 + 1
+                "pmullw     %%xmm4, %%xmm2     \n\t" // (abs(block[i])*2 + 1)*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // (abs(block[i])*2 + 1)*q
+                "psraw          $4, %%xmm2     \n\t"
+                "psraw          $4, %%xmm3     \n\t"
+                "psubw      %%xmm7, %%xmm2     \n\t"
+                "psubw      %%xmm7, %%xmm3     \n\t"
+                "por        %%xmm7, %%xmm2     \n\t"
+                "por        %%xmm7, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
 
-                "add $16, %0                    \n\t"
-                "js 1b                          \n\t"
+                "add           $32, %0         \n\t"
+                "js 1b                         \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
 }
 
+#endif /* HAVE_SSSE3_INLINE */
+
 static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
                                            int16_t *block, int n, int qscale)
 {
@@ -417,8 +390,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
-        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
-        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
@@ -427,6 +398,8 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     if (INLINE_SSSE3(cpu_flags)) {
         s->dct_unquantize_h263_intra  = dct_unquantize_h263_intra_ssse3;
         s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
+        s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
+        s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
-- 
2.49.1


>From 1782943ccefe7b81bb9e35a017324276a8fcae82 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 4 Nov 2025 06:45:12 +0100
Subject: [PATCH 15/16] avcodec/x86/mpegvideo: Port
 dct_unquantize_mpeg2_inter_mmx to SSSE3

Benefits from wider registers, pabsw and psignw.

Benchmarks:
dct_unquantize_mpeg2_inter_c:                          131.2 ( 1.00x)
dct_unquantize_mpeg2_inter_mmx:                         50.2 ( 2.62x)
dct_unquantize_mpeg2_inter_ssse3:                       20.5 ( 6.38x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c | 109 +++++++++++++++++--------------------
 1 file changed, 49 insertions(+), 60 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 01048df47d..576f8f320f 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -305,8 +305,10 @@ __asm__ volatile(
         //Note, we do not do mismatch control for intra as errors cannot accumulate
 }
 
-static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+#if HAVE_SSSE3_INLINE
+
+static void dct_unquantize_mpeg2_inter_ssse3(const MPVContext *s,
+                                             int16_t *block, int n, int qscale)
 {
     av_assert2(s->block_last_index[n]>=0);
 
@@ -316,72 +318,59 @@ static void dct_unquantize_mpeg2_inter_mmx(const MPVContext *s,
 
 
 __asm__ volatile(
-                "movd          %k1, %%mm6      \n\t"
+                "movd          %k1, %%xmm6     \n\t"
                 "lea      (%2, %0), %1         \n\t"
                 "neg            %0             \n\t"
-                "pcmpeqw %%mm7, %%mm7           \n\t"
-                "psrlq $48, %%mm7               \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq     (%1, %0), %%mm0      \n\t"
-                "movq    8(%1, %0), %%mm1      \n\t"
-                "movq     (%3, %0), %%mm4      \n\t"
-                "movq    8(%3, %0), %%mm5      \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "pxor %%mm2, %%mm2              \n\t"
-                "pxor %%mm3, %%mm3              \n\t"
-                "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
-                "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
-                "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
-                "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
-                "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
-                "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
-                "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
-                "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
-                "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
-                "pxor %%mm4, %%mm4              \n\t"
-                "pxor %%mm5, %%mm5              \n\t" // FIXME slow
-                "pcmpeqw  (%1, %0), %%mm4      \n\t" // block[i] == 0 ? -1 : 0
-                "pcmpeqw 8(%1, %0), %%mm5      \n\t" // block[i] == 0 ? -1 : 0
-                "psrlw $5, %%mm0                \n\t"
-                "psrlw $5, %%mm1                \n\t"
-                "pxor %%mm2, %%mm0              \n\t"
-                "pxor %%mm3, %%mm1              \n\t"
-                "psubw %%mm2, %%mm0             \n\t"
-                "psubw %%mm3, %%mm1             \n\t"
-                "pandn %%mm0, %%mm4             \n\t"
-                "pandn %%mm1, %%mm5             \n\t"
-                "pxor %%mm4, %%mm7              \n\t"
-                "pxor %%mm5, %%mm7              \n\t"
-                "movq        %%mm4, (%1, %0)   \n\t"
-                "movq        %%mm5, 8(%1, %0)  \n\t"
+                SPLATW(xmm6)
+                "pcmpeqw    %%xmm7, %%xmm7     \n\t"
+                "psrldq        $14, %%xmm7     \n\t"
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%3, %0), %%xmm4     \n\t"
+                "movdqa 16(%3, %0), %%xmm5     \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "pabsw      %%xmm0, %%xmm2     \n\t" // abs(block[i])
+                "pabsw      %%xmm1, %%xmm3     \n\t" // abs(block[i])
+                "paddw      %%xmm2, %%xmm2     \n\t" // abs(block[i])*2
+                "paddw      %%xmm3, %%xmm3     \n\t" // abs(block[i])*2
+                "pmullw     %%xmm4, %%xmm2     \n\t" // abs(block[i])*2*q
+                "pmullw     %%xmm5, %%xmm3     \n\t" // abs(block[i])*2*q
+                "paddw      %%xmm4, %%xmm2     \n\t" // (abs(block[i])*2 + 1)*q
+                "paddw      %%xmm5, %%xmm3     \n\t" // (abs(block[i])*2 + 1)*q
+                "psrlw          $5, %%xmm2     \n\t"
+                "psrlw          $5, %%xmm3     \n\t"
+                "psignw     %%xmm0, %%xmm2     \n\t"
+                "psignw     %%xmm1, %%xmm3     \n\t"
+                "movdqa     %%xmm2, (%1, %0)   \n\t"
+                "movdqa     %%xmm3, 16(%1, %0) \n\t"
+                "pxor       %%xmm2, %%xmm7     \n\t"
+                "pxor       %%xmm3, %%xmm7     \n\t"
 
-                "add           $16, %0          \n\t"
-                "jng 1b                         \n\t"
-                "movd      124(%2), %%mm0      \n\t"
-                "movq %%mm7, %%mm6              \n\t"
-                "psrlq $32, %%mm7               \n\t"
-                "pxor %%mm6, %%mm7              \n\t"
-                "movq %%mm7, %%mm6              \n\t"
-                "psrlq $16, %%mm7               \n\t"
-                "pxor %%mm6, %%mm7              \n\t"
-                "pslld $31, %%mm7               \n\t"
-                "psrlq $15, %%mm7               \n\t"
-                "pxor %%mm7, %%mm0              \n\t"
-                "movd        %%mm0, 124(%2)    \n\t"
+                "add           $32, %0         \n\t"
+                "jng 1b                        \n\t"
+                "movd      124(%2), %%xmm0     \n\t"
+                "movhlps    %%xmm7, %%xmm6     \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pshufd $1, %%xmm7, %%xmm6     \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pshuflw $1, %%xmm7, %%xmm6    \n\t"
+                "pxor       %%xmm6, %%xmm7     \n\t"
+                "pslld         $31, %%xmm7     \n\t"
+                "psrld         $15, %%xmm7     \n\t"
+                "pxor       %%xmm7, %%xmm0     \n\t"
+                "movd       %%xmm0, 124(%2)    \n\t"
 
                 : "+r"(offset), "+r" (qscale2)
                 : "r" (block), "r"(quant_matrix)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",)
+                  "memory"
         );
 }
 
+#endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
 
 av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
@@ -392,7 +381,6 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
     if (INLINE_MMX(cpu_flags)) {
         if (!bitexact)
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
-        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
     }
 #if HAVE_SSSE3_INLINE
     if (INLINE_SSSE3(cpu_flags)) {
@@ -400,6 +388,7 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
         s->dct_unquantize_h263_inter  = dct_unquantize_h263_inter_ssse3;
         s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_ssse3;
         s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_ssse3;
+        s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
 #endif /* HAVE_MMX_INLINE */
-- 
2.49.1


>From 8f82c18dbbf4248dbede522a978c6c82b78c1d18 Mon Sep 17 00:00:00 2001
From: Andreas Rheinhardt <[email protected]>
Date: Tue, 4 Nov 2025 06:48:19 +0100
Subject: [PATCH 16/16] avcodec/x86/mpegvideo: Port
 dct_unquantize_mpeg2_intra_mmx to SSE2

Benefits from wider registers.

Benchmarks:
dct_unquantize_mpeg2_intra_c:                          228.2 ( 1.00x)
dct_unquantize_mpeg2_intra_mmx:                         28.2 ( 8.10x)
dct_unquantize_mpeg2_intra_sse2:                        18.4 (12.37x)

Signed-off-by: Andreas Rheinhardt <[email protected]>
---
 libavcodec/x86/mpegvideo.c            | 68 +++++++++++++--------------
 tests/checkasm/mpegvideo_unquantize.c |  2 +-
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 576f8f320f..7c137cf75e 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -28,7 +28,7 @@
 #include "libavcodec/mpegvideodata.h"
 #include "libavcodec/mpegvideo_unquantize.h"
 
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
 
 #define SPLATW(reg) "punpcklwd    %%" #reg ", %%" #reg "\n\t" \
                     "pshufd   $0, %%" #reg ", %%" #reg "\n\t"
@@ -250,8 +250,8 @@ __asm__ volatile(
 
 #endif /* HAVE_SSSE3_INLINE */
 
-static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
-                                           int16_t *block, int n, int qscale)
+static void dct_unquantize_mpeg2_intra_sse2(const MPVContext *s,
+                                            int16_t *block, int n, int qscale)
 {
     x86_reg nCoeffs;
     const uint16_t *quant_matrix;
@@ -271,35 +271,35 @@ static void dct_unquantize_mpeg2_intra_mmx(const MPVContext *s,
     quant_matrix = s->intra_matrix;
     x86_reg offset = -2 * nCoeffs;
 __asm__ volatile(
-                "movd %3, %%mm6                 \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                "packssdw %%mm6, %%mm6          \n\t"
-                ".p2align 4                     \n\t"
-                "1:                             \n\t"
-                "movq (%1, %0), %%mm0           \n\t"
-                "movq 8(%1, %0), %%mm1          \n\t"
-                "movq (%2, %0), %%mm4           \n\t"
-                "movq 8(%2, %0), %%mm5          \n\t"
-                "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
-                "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
-                "movq %%mm0, %%mm2              \n\t"
-                "movq %%mm1, %%mm3              \n\t"
-                "psrlw $12, %%mm2               \n\t" // block[i] < 0 ? 0xf : 0
-                "psrlw $12, %%mm3               \n\t" // (block[i] is in the -2048..2047 range)
-                "pmullw %%mm4, %%mm0            \n\t" // block[i]*q
-                "pmullw %%mm5, %%mm1            \n\t" // block[i]*q
-                "paddw %%mm2, %%mm0             \n\t" // bias negative block[i]
-                "paddw %%mm3, %%mm1             \n\t" // so that a right-shift
-                "psraw $4, %%mm0                \n\t" // is equivalent to divide
-                "psraw $4, %%mm1                \n\t" // with rounding towards zero
-                "movq %%mm0, (%1, %0)           \n\t"
-                "movq %%mm1, 8(%1, %0)          \n\t"
+                "movd           %3, %%xmm6     \n\t"
+                SPLATW(xmm6)
+                ".p2align 4                    \n\t"
+                "1:                            \n\t"
+                "movdqa   (%1, %0), %%xmm0     \n\t"
+                "movdqa 16(%1, %0), %%xmm1     \n\t"
+                "movdqa   (%2, %0), %%xmm4     \n\t"
+                "movdqa 16(%2, %0), %%xmm5     \n\t"
+                "pmullw     %%xmm6, %%xmm4     \n\t" // q=qscale*quant_matrix[i]
+                "pmullw     %%xmm6, %%xmm5     \n\t" // q=qscale*quant_matrix[i]
+                "movdqa     %%xmm0, %%xmm2     \n\t"
+                "movdqa     %%xmm1, %%xmm3     \n\t"
+                "psrlw         $12, %%xmm2     \n\t" // block[i] < 0 ? 0xf : 0
+                "psrlw         $12, %%xmm3     \n\t" // (block[i] is in the -2048..2047 range)
+                "pmullw     %%xmm4, %%xmm0     \n\t" // block[i]*q
+                "pmullw     %%xmm5, %%xmm1     \n\t" // block[i]*q
+                "paddw      %%xmm2, %%xmm0     \n\t" // bias negative block[i]
+                "paddw      %%xmm3, %%xmm1     \n\t" // so that a right-shift
+                "psraw          $4, %%xmm0     \n\t" // is equivalent to divide
+                "psraw          $4, %%xmm1     \n\t" // with rounding towards zero
+                "movdqa     %%xmm0, (%1, %0)   \n\t"
+                "movdqa     %%xmm1, 16(%1, %0) \n\t"
 
-                "add $16, %0                    \n\t"
-                "jng 1b                         \n\t"
+                "add           $32, %0         \n\t"
+                "jng 1b                        \n\t"
                 : "+r" (offset)
                 : "r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale)
-                : "memory"
+                : XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",)
+                  "memory"
         );
     block[0]= block0;
         //Note, we do not do mismatch control for intra as errors cannot accumulate
@@ -371,16 +371,16 @@ __asm__ volatile(
 }
 
 #endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 
 av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
 {
-#if HAVE_MMX_INLINE
+#if HAVE_SSE2_INLINE
     int cpu_flags = av_get_cpu_flags();
 
-    if (INLINE_MMX(cpu_flags)) {
+    if (INLINE_SSE2(cpu_flags)) {
         if (!bitexact)
-            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
+            s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_sse2;
     }
 #if HAVE_SSSE3_INLINE
     if (INLINE_SSSE3(cpu_flags)) {
@@ -391,5 +391,5 @@ av_cold void ff_mpv_unquantize_init_x86(MPVUnquantDSPContext *s, int bitexact)
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_ssse3;
     }
 #endif /* HAVE_SSSE3_INLINE */
-#endif /* HAVE_MMX_INLINE */
+#endif /* HAVE_SSE2_INLINE */
 }
diff --git a/tests/checkasm/mpegvideo_unquantize.c b/tests/checkasm/mpegvideo_unquantize.c
index 837606e60e..220a743a96 100644
--- a/tests/checkasm/mpegvideo_unquantize.c
+++ b/tests/checkasm/mpegvideo_unquantize.c
@@ -215,7 +215,7 @@ void checkasm_check_mpegvideo_unquantize(void)
     int q_scale_type = rnd() & 1;
 
     ff_mpv_unquantize_init(&unquant_dsp_ctx, 1 /* bitexact */, q_scale_type);
-    declare_func_emms(AV_CPU_FLAG_MMX, void, MPVContext *s, int16_t *block, int n, int qscale);
+    declare_func(void, MPVContext *s, int16_t *block, int n, int qscale);
 
     for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
         void (*func)(MPVContext *s, int16_t *block, int n, int qscale) =
-- 
2.49.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-devel] [PATCH] Port mpegvideo unquantize to SSE2/SSSE3 (PR #21049)

Reply via email to