rnd_template: Merge into hpeldsp_init.c

ffmpeg-git--- via ffmpeg-cvslog Thu, 25 Sep 2025 21:38:10 -0700

The branch, master has been updated
       via  a54d6b1d91ba17f6e1316997dd5f0ced4cee8ee5 (commit)
       via  43fe9554cc7998fbe0bae455c7b374e76a4d253f (commit)
       via  00e046df132fd1751c50798334b985beec89661f (commit)
       via  30c4007c65e1f73de5ce1b5eb459c71e0b21389a (commit)
       via  1e677e696488d52068e83c669ae871caa7c34583 (commit)
       via  262791b8d8c7a5e3df44c8784de192857e67d52f (commit)
       via  c7161befb4ae7d0f40e35676f52507e7de1c8b01 (commit)
       via  4fc05c28f426d6073e6e15db334b0c88ff925f1d (commit)
       via  5ef613bcb0508f16bd5b190168183326391de9b0 (commit)
       via  6a47ea5f9fdaedd6aa4bc8723c86a0c7a30d8ed1 (commit)
       via  918d37d9d156f15b63952a22bfba0541dd087129 (commit)
       via  e86f137514fb8a69cf145f26c83b1b053c727b52 (commit)
       via  2cf9e733c6a666600423a0967f23341d9f09e3c8 (commit)
       via  1f9ef6a8dc6e57b360cf53dd644fde1936ad3047 (commit)
       via  8a7858dacf50797c7b81aad119e8811a849d0552 (commit)
       via  4d691da5edb360fa043df8ce267a382cfcdaf07a (commit)
       via  4e2ef29cbaa258cb73f06e62435198736e493a10 (commit)
       via  fcb9e0b5f019ec46dffb6d769793ccb7d884fb14 (commit)
       via  89f2016ece77868cc1982ae104d56b25aaf519c3 (commit)
       via  b316a1bdd122ca1bcb43b20dbd6bc9c244f98cfe (commit)
      from  baace56169a8cea7b44d727bdf656110aace011d (commit)



- Log -----------------------------------------------------------------
commit a54d6b1d91ba17f6e1316997dd5f0ced4cee8ee5
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 05:01:41 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:58 2025 +0200

    avcodec/x86/rnd_template: Merge into hpeldsp_init.c
    
    It is now only included exactly once.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 66ed886ea9..cb47cb7752 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -33,6 +33,7 @@
 #include "libavcodec/pixels.h"
 #include "fpel.h"
 #include "hpeldsp.h"
+#include "inline_asm.h"
 
 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
@@ -73,15 +74,74 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
 
 /***********************************/
 /* MMX no rounding */
-#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
-#define SET_RND  MOVQ_WONE
-#define STATIC static
 
-#include "rnd_template.c"
+// put_pixels
+static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+                                       ptrdiff_t line_size, int h)
+{
+    MOVQ_ZERO(mm7);
+    MOVQ_WONE(mm6); // =1 for no_rnd version
+    __asm__ volatile(
+        "movq   (%1), %%mm0             \n\t"
+        "movq   1(%1), %%mm4            \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
+        "add    %3, %1                  \n\t"
+        ".p2align 3                     \n\t"
+        "1:                             \n\t"
+        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
+        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
+        "movq   %%mm0, %%mm1            \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "punpcklbw %%mm7, %%mm0         \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpckhbw %%mm7, %%mm1         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "paddusw %%mm2, %%mm0           \n\t"
+        "paddusw %%mm3, %%mm1           \n\t"
+        "paddusw %%mm6, %%mm4           \n\t"
+        "paddusw %%mm6, %%mm5           \n\t"
+        "paddusw %%mm0, %%mm4           \n\t"
+        "paddusw %%mm1, %%mm5           \n\t"
+        "psrlw  $2, %%mm4               \n\t"
+        "psrlw  $2, %%mm5               \n\t"
+        "packuswb  %%mm5, %%mm4         \n\t"
+        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"           \n\t"
+
+        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
+        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
+        "movq   %%mm2, %%mm3            \n\t"
+        "movq   %%mm4, %%mm5            \n\t"
+        "punpcklbw %%mm7, %%mm2         \n\t"
+        "punpcklbw %%mm7, %%mm4         \n\t"
+        "punpckhbw %%mm7, %%mm3         \n\t"
+        "punpckhbw %%mm7, %%mm5         \n\t"
+        "paddusw %%mm2, %%mm4           \n\t"
+        "paddusw %%mm3, %%mm5           \n\t"
+        "paddusw %%mm6, %%mm0           \n\t"
+        "paddusw %%mm6, %%mm1           \n\t"
+        "paddusw %%mm4, %%mm0           \n\t"
+        "paddusw %%mm5, %%mm1           \n\t"
+        "psrlw  $2, %%mm0               \n\t"
+        "psrlw  $2, %%mm1               \n\t"
+        "packuswb  %%mm1, %%mm0         \n\t"
+        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
+        "add    %3, %%"FF_REG_a"        \n\t"
 
-#undef DEF
-#undef SET_RND
-#undef STATIC
+        "subl   $2, %0                  \n\t"
+        "jnz    1b                      \n\t"
+        :"+g"(h), "+S"(pixels)
+        :"D"(block), "r"((x86_reg)line_size)
+        :FF_REG_a, "memory");
+}
 
 // this routine is 'slightly' suboptimal but mostly unused
 static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
deleted file mode 100644
index 4590aeddf0..0000000000
--- a/libavcodec/x86/rnd_template.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2003-2004 Michael Niedermayer <[email protected]>
- *
- * MMX optimization by Nick Kurshev <[email protected]>
- * mostly rewritten by Michael Niedermayer <[email protected]>
- * and improved by Zdenek Kabelac <[email protected]>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "inline_asm.h"
-
-// put_pixels
-av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
-                                  ptrdiff_t line_size, int h)
-{
-    MOVQ_ZERO(mm7);
-    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
-    __asm__ volatile(
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm4            \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
-        "add    %3, %1                  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
-        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
-        "movq   %%mm0, %%mm1            \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "punpcklbw %%mm7, %%mm0         \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpckhbw %%mm7, %%mm1         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "paddusw %%mm2, %%mm0           \n\t"
-        "paddusw %%mm3, %%mm1           \n\t"
-        "paddusw %%mm6, %%mm4           \n\t"
-        "paddusw %%mm6, %%mm5           \n\t"
-        "paddusw %%mm0, %%mm4           \n\t"
-        "paddusw %%mm1, %%mm5           \n\t"
-        "psrlw  $2, %%mm4               \n\t"
-        "psrlw  $2, %%mm5               \n\t"
-        "packuswb  %%mm5, %%mm4         \n\t"
-        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"           \n\t"
-
-        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
-        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
-        "movq   %%mm2, %%mm3            \n\t"
-        "movq   %%mm4, %%mm5            \n\t"
-        "punpcklbw %%mm7, %%mm2         \n\t"
-        "punpcklbw %%mm7, %%mm4         \n\t"
-        "punpckhbw %%mm7, %%mm3         \n\t"
-        "punpckhbw %%mm7, %%mm5         \n\t"
-        "paddusw %%mm2, %%mm4           \n\t"
-        "paddusw %%mm3, %%mm5           \n\t"
-        "paddusw %%mm6, %%mm0           \n\t"
-        "paddusw %%mm6, %%mm1           \n\t"
-        "paddusw %%mm4, %%mm0           \n\t"
-        "paddusw %%mm5, %%mm1           \n\t"
-        "psrlw  $2, %%mm0               \n\t"
-        "psrlw  $2, %%mm1               \n\t"
-        "packuswb  %%mm1, %%mm0         \n\t"
-        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
-        "add    %3, %%"FF_REG_a"        \n\t"
-
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels)
-        :"D"(block), "r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}

commit 43fe9554cc7998fbe0bae455c7b374e76a4d253f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:49:44 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:55 2025 +0200

    avcodec/x86/hpeldsp_init: Avoid complicating macro
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 48a1aa7a2c..66ed886ea9 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -69,8 +69,6 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 
-#define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
-
 #if HAVE_INLINE_ASM
 
 /***********************************/
@@ -167,25 +165,16 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #endif
 #endif /* HAVE_INLINE_ASM */
 
-#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
-    if (HAVE_MMX_EXTERNAL)                                                  \
-        c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU
-
-#define SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU)                                   \
-    do {                                                                        \
-        SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                \
-        c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
-    } while (0)
-
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMX_INLINE
     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
     c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
+#endif
 #if HAVE_MMX_EXTERNAL
+    c->put_no_rnd_pixels_tab[1][0] =
     c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
-#endif
-    SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
 #endif
 }
 

commit 00e046df132fd1751c50798334b985beec89661f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:32:55 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:53 2025 +0200

    avcodec/x86/hpeldsp_init: Remove MMX(EXT) funcs overridden by SSE2
    
    This affects the {avg,put}_no_rnd_pixels16_{x,y}2 MMX and
    (put-only) MMXEXT versions. Removing these functions saved
    1184B here.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 4f369c9731..48a1aa7a2c 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -161,167 +161,12 @@ static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
         :FF_REG_a, "memory");
 }
 
-static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "movq   8(%1), %%mm0            \n\t"
-        "movq   9(%1), %%mm1            \n\t"
-        "movq   8(%1, %3), %%mm2        \n\t"
-        "movq   9(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, 8(%2)            \n\t"
-        "movq   %%mm5, 8(%2, %3)        \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea (%3, %3), %%"FF_REG_a"     \n\t"
-        "movq (%1), %%mm0               \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"),%%mm2\n\t"
-        PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"),%%mm0\n\t"
-        PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
-static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-        __asm__ volatile(
-            ".p2align 3                 \n\t"
-            "1:                         \n\t"
-            "movq  (%1), %%mm0          \n\t"
-            "movq  1(%1), %%mm1         \n\t"
-            "movq  (%2), %%mm3          \n\t"
-            PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, (%2)          \n\t"
-            "movq  8(%1), %%mm0         \n\t"
-            "movq  9(%1), %%mm1         \n\t"
-            "movq  8(%2), %%mm3         \n\t"
-            PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
-            PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
-            "movq  %%mm0, 8(%2)         \n\t"
-            "add    %3, %1              \n\t"
-            "add    %3, %2              \n\t"
-            "subl   $1, %0              \n\t"
-            "jnz    1b                  \n\t"
-            :"+g"(h), "+S"(pixels), "+D"(block)
-            :"r"((x86_reg)line_size)
-            :"memory");
-}
-
-static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm2 \n\t"
-        PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4,   %%mm2, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm0, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-
-        "movq   (%1, %3), %%mm1         \n\t"
-        "movq   (%1, %%"FF_REG_a"), %%mm0 \n\t"
-        PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4,   %%mm0, %%mm1, %%mm5)
-        "movq   (%2), %%mm3             \n\t"
-        PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
-        "movq   (%2, %3), %%mm3         \n\t"
-        PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
-        "movq   %%mm2, (%2)             \n\t"
-        "movq   %%mm1, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
 #if HAVE_MMX
-CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
-
 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #endif
 #endif /* HAVE_INLINE_ASM */
 
-
-#if HAVE_X86ASM
-
-#define HPELDSP_AVG_PIXELS16(CPUEXT)                      \
-    CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
-    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8)
-
-HPELDSP_AVG_PIXELS16(_mmxext)
-
-#endif /* HAVE_X86ASM */
-
 #define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU)                             \
     if (HAVE_MMX_EXTERNAL)                                                  \
         c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU
@@ -331,18 +176,11 @@ HPELDSP_AVG_PIXELS16(_mmxext)
         SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);                                \
         c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
     } while (0)
-#define SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU)                                   \
-    do {                                                                        \
-        c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
-        c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
-    } while (0)
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMX_INLINE
-    SET_HPEL_FUNCS12(put_no_rnd, [0], 16, mmx);
     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
-    SET_HPEL_FUNCS12(avg_no_rnd,  , 16, mmx);
     c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
 #if HAVE_MMX_EXTERNAL
     c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
@@ -365,8 +203,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
 
     if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
-        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
-        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
     }

commit 30c4007c65e1f73de5ce1b5eb459c71e0b21389a
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:15:22 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:49 2025 +0200

    avcodec/x86/hpeldsp: Add SSE2 avg_no_rnd size 16 versions
    
    These currently only exist as MMX versions.
    The added functions occupy 320B here. So far, they are only for
    the x2 and y2 (i.e. right and down, not down-right) directions.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 522a349e21..e9f988f7b5 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -125,12 +125,12 @@ cglobal put_no_rnd_pixels8_x2, 4,5
     RET
 
 
-%macro NO_RND_PIXELS_X2 0
+%macro NO_RND_PIXELS_X2 1
 %if cpuflag(sse2)
-cglobal put_no_rnd_pixels16_x2, 4,5,5
+cglobal %1_no_rnd_pixels16_x2, 4,5,5
 %else
 ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-cglobal put_no_rnd_pixels8_x2_exact, 4,5
+cglobal %1_no_rnd_pixels8_x2_exact, 4,5
 %endif
     lea          r4, [r2*3]
     pcmpeqb      m4, m4
@@ -147,6 +147,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
     PAVGB        m2, m3
     pxor         m0, m4
     pxor         m2, m4
+%ifidn %1, avg
+    pavgb        m0, [r0]
+    pavgb        m2, [r0+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m2
     movu         m0, [r1+r2*2]
@@ -161,6 +165,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
     PAVGB        m2, m3
     pxor         m0, m4
     pxor         m2, m4
+%ifidn %1, avg
+    pavgb        m0, [r0+r2*2]
+    pavgb        m2, [r0+r4]
+%endif
     mova  [r0+r2*2], m0
     mova    [r0+r4], m2
     lea          r1, [r1+r2*4]
@@ -171,9 +179,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
 %endmacro
 
 INIT_MMX mmxext
-NO_RND_PIXELS_X2
+NO_RND_PIXELS_X2 put
 INIT_XMM sse2
-NO_RND_PIXELS_X2
+NO_RND_PIXELS_X2 avg
+NO_RND_PIXELS_X2 put
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
@@ -245,12 +254,12 @@ cglobal put_no_rnd_pixels8_y2, 4,5
     RET
 
 
-%macro NO_RND_PIXELS_Y2 0
+%macro NO_RND_PIXELS_Y2 1
 %if cpuflag(sse2)
-cglobal put_no_rnd_pixels16_y2, 4,5,4
+cglobal %1_no_rnd_pixels16_y2, 4,5,4
 %else
 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-cglobal put_no_rnd_pixels8_y2_exact, 4,5
+cglobal %1_no_rnd_pixels8_y2_exact, 4,5
 %endif
     lea          r4, [r2*3]
     movu         m0, [r1]
@@ -266,6 +275,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
     PAVGB        m1, m2
     pxor         m0, m3
     pxor         m1, m3
+%ifidn %1, avg
+    pavgb        m0, [r0]
+    pavgb        m1, [r0+r2]
+%endif
     mova       [r0], m0
     mova    [r0+r2], m1
     movu         m1, [r1+r2*2]
@@ -276,6 +289,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
     PAVGB        m1, m0
     pxor         m2, m3
     pxor         m1, m3
+%ifidn %1, avg
+    pavgb        m2,[r0+r2*2]
+    pavgb        m1,[r0+r4]
+%endif
     mova  [r0+r2*2], m2
     mova    [r0+r4], m1
     lea          r1, [r1+r2*4]
@@ -286,9 +303,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
 %endmacro
 
 INIT_MMX mmxext
-NO_RND_PIXELS_Y2
+NO_RND_PIXELS_Y2 put
 INIT_XMM sse2
-NO_RND_PIXELS_Y2
+NO_RND_PIXELS_Y2 avg
+NO_RND_PIXELS_Y2 put
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c8ccd7b011..4f369c9731 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -51,6 +51,8 @@ void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                            ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -60,6 +62,8 @@ void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                            ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -385,7 +389,10 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
     c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;
     c->avg_pixels_tab[0][3]        = ff_avg_pixels16_xy2_sse2;
+
     c->avg_no_rnd_pixels_tab[0]    = ff_avg_pixels16_sse2;
+    c->avg_no_rnd_pixels_tab[1]    = ff_avg_no_rnd_pixels16_x2_sse2;
+    c->avg_no_rnd_pixels_tab[2]    = ff_avg_no_rnd_pixels16_y2_sse2;
 #endif /* HAVE_SSE2_EXTERNAL */
 }
 

commit 1e677e696488d52068e83c669ae871caa7c34583
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 03:52:28 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:46 2025 +0200

    avcodec/x86/hpeldsp: Add SSE2 put_no_rnd size 16 versions
    
    These currently only exist as MMX and (not bitexact) MMXEXT versions.
    The added functions occupy 288B here. So far, they are only for
    the x2 and y2 (i.e. right and down, not down-right) directions.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 859894856d..522a349e21 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -125,38 +125,42 @@ cglobal put_no_rnd_pixels8_x2, 4,5
     RET
 
 
+%macro NO_RND_PIXELS_X2 0
+%if cpuflag(sse2)
+cglobal put_no_rnd_pixels16_x2, 4,5,5
+%else
 ; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-INIT_MMX mmxext
 cglobal put_no_rnd_pixels8_x2_exact, 4,5
+%endif
     lea          r4, [r2*3]
-    pcmpeqb      m6, m6
+    pcmpeqb      m4, m4
 .loop:
-    mova         m0, [r1]
-    mova         m2, [r1+r2]
-    mova         m1, [r1+1]
-    mova         m3, [r1+r2+1]
-    pxor         m0, m6
-    pxor         m2, m6
-    pxor         m1, m6
-    pxor         m3, m6
+    movu         m0, [r1]
+    movu         m2, [r1+r2]
+    movu         m1, [r1+1]
+    movu         m3, [r1+r2+1]
+    pxor         m0, m4
+    pxor         m2, m4
+    pxor         m1, m4
+    pxor         m3, m4
     PAVGB        m0, m1
     PAVGB        m2, m3
-    pxor         m0, m6
-    pxor         m2, m6
+    pxor         m0, m4
+    pxor         m2, m4
     mova       [r0], m0
     mova    [r0+r2], m2
-    mova         m0, [r1+r2*2]
-    mova         m1, [r1+r2*2+1]
-    mova         m2, [r1+r4]
-    mova         m3, [r1+r4+1]
-    pxor         m0, m6
-    pxor         m1, m6
-    pxor         m2, m6
-    pxor         m3, m6
+    movu         m0, [r1+r2*2]
+    movu         m1, [r1+r2*2+1]
+    movu         m2, [r1+r4]
+    movu         m3, [r1+r4+1]
+    pxor         m0, m4
+    pxor         m1, m4
+    pxor         m2, m4
+    pxor         m3, m4
     PAVGB        m0, m1
     PAVGB        m2, m3
-    pxor         m0, m6
-    pxor         m2, m6
+    pxor         m0, m4
+    pxor         m2, m4
     mova  [r0+r2*2], m0
     mova    [r0+r4], m2
     lea          r1, [r1+r2*4]
@@ -164,7 +168,12 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
     sub         r3d, 4
     jg .loop
     RET
+%endmacro
 
+INIT_MMX mmxext
+NO_RND_PIXELS_X2
+INIT_XMM sse2
+NO_RND_PIXELS_X2
 
 ; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro PUT_PIXELS8_Y2 0
@@ -236,33 +245,37 @@ cglobal put_no_rnd_pixels8_y2, 4,5
     RET
 
 
+%macro NO_RND_PIXELS_Y2 0
+%if cpuflag(sse2)
+cglobal put_no_rnd_pixels16_y2, 4,5,4
+%else
 ; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-INIT_MMX mmxext
 cglobal put_no_rnd_pixels8_y2_exact, 4,5
+%endif
     lea          r4, [r2*3]
-    mova         m0, [r1]
-    pcmpeqb      m6, m6
+    movu         m0, [r1]
+    pcmpeqb      m3, m3
     add          r1, r2
-    pxor         m0, m6
+    pxor         m0, m3
 .loop:
-    mova         m1, [r1]
-    mova         m2, [r1+r2]
-    pxor         m1, m6
-    pxor         m2, m6
+    movu         m1, [r1]
+    movu         m2, [r1+r2]
+    pxor         m1, m3
+    pxor         m2, m3
     PAVGB        m0, m1
     PAVGB        m1, m2
-    pxor         m0, m6
-    pxor         m1, m6
+    pxor         m0, m3
+    pxor         m1, m3
     mova       [r0], m0
     mova    [r0+r2], m1
-    mova         m1, [r1+r2*2]
-    mova         m0, [r1+r4]
-    pxor         m1, m6
-    pxor         m0, m6
+    movu         m1, [r1+r2*2]
+    movu         m0, [r1+r4]
+    pxor         m1, m3
+    pxor         m0, m3
     PAVGB        m2, m1
     PAVGB        m1, m0
-    pxor         m2, m6
-    pxor         m1, m6
+    pxor         m2, m3
+    pxor         m1, m3
     mova  [r0+r2*2], m2
     mova    [r0+r4], m1
     lea          r1, [r1+r2*4]
@@ -270,7 +283,12 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
     sub         r3d, 4
     jg .loop
     RET
+%endmacro
 
+INIT_MMX mmxext
+NO_RND_PIXELS_Y2
+INIT_XMM sse2
+NO_RND_PIXELS_Y2
 
 ; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro AVG_PIXELS8_X2 0
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index ab32b825c9..c8ccd7b011 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -49,6 +49,8 @@ void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
                                            const uint8_t *pixels,
                                            ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -56,6 +58,8 @@ void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
                                            const uint8_t *pixels,
                                            ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+                                    ptrdiff_t line_size, int h);
 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -369,10 +373,14 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
 {
 #if HAVE_SSE2_EXTERNAL
     c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
-    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
     c->put_pixels_tab[0][1]        = ff_put_pixels16_x2_sse2;
     c->put_pixels_tab[0][2]        = ff_put_pixels16_y2_sse2;
     c->put_pixels_tab[0][3]        = ff_put_pixels16_xy2_sse2;
+
+    c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+    c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
+    c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
+
     c->avg_pixels_tab[0][0]        = ff_avg_pixels16_sse2;
     c->avg_pixels_tab[0][1]        = ff_avg_pixels16_x2_sse2;
     c->avg_pixels_tab[0][2]        = ff_avg_pixels16_y2_sse2;

commit 262791b8d8c7a5e3df44c8784de192857e67d52f
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 05:41:04 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:43 2025 +0200

    avcodec/hpeldsp: Make put_no_rnd_pixels_tab smaller
    
    Only the blocksizes 16 and 8 are implemented, yet the motion estimation
    code touches the blocksize 4 entries. But really nothing touches
    the blocksize 2 entries, so that we can reduce the put_no_rnd_pixels_tab
    array size to [3][4].
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 1f6a165bf6..6c9fdce0c1 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -77,10 +77,10 @@ typedef struct HpelDSPContext {
      * @param pixels source
      * @param line_size number of bytes in a horizontal line of block
      * @param h height
-     * @note The size is kept at [4][4] to match the above pixel_tabs and avoid
-     *       out of bounds reads in the motion estimation code.
+     * @note The size is kept at [3][4] to avoid out of bounds accesses
+     *       in the motion estimation code.
      */
-    op_pixels_func put_no_rnd_pixels_tab[4][4];
+    op_pixels_func put_no_rnd_pixels_tab[3][4];
 
     /**
      * Halfpel motion compensation with no rounding (a+b)>>1.

commit c7161befb4ae7d0f40e35676f52507e7de1c8b01
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 15:12:49 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:39 2025 +0200

    avcodec/x86/h264_qpel: Remove MMX(EXT) funcs overridden by SSSE3
    
    SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
    so that the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 versions.
    This commit therefore removes the MMX(EXT) functions overridden
    by them (which don't abide by the ABI) to get closer to a removal
    of emms_c.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 69ffd001e0..18d80a52f6 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -46,12 +46,10 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-#define ff_put_pixels8_mmxext(...)
 #define ff_put_pixels4_mmxext(...)
 
 #define DEF_QPEL(OPNAME)\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride);\
 void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
 void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
@@ -91,15 +89,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX
     }while(w--);\
 }\
 \
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
-    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-    src += 8*srcStride;\
-    dst += 8*dstStride;\
-    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst  , src  , dstStride, srcStride);\
-    ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
-}\
-\
 static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride){\
     ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst  , src  , src2  , dstStride, src2Stride);\
     ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
@@ -196,10 +185,6 @@ static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uin
 #define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
 #define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
 
-#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-
 #define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
 H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
 H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
@@ -356,8 +341,7 @@ QPEL_H264_HV_XMM(put_,       PUT_OP, ssse3)
 QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 
 H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
-H264_MC(H264_MC_C_H, 8, mmxext, 8)
-H264_MC(H264_MC_H, 16, mmxext, 8)
+H264_MC_C(avg_, 8, mmxext, 8)
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
 H264_MC_816(H264_MC_H, ssse3)
@@ -421,20 +405,11 @@ LUMA_MC_816(10, mc33, sse2)
 
 #endif /* HAVE_X86ASM */
 
-#define SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX)                       \
+#define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX)                      \
     do {                                                                     \
     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
-    } while (0)
-#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX)                      \
-    do {                                                                     \
-    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
-    SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX);                          \
-    } while (0)
-#define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX)                      \
-    do {                                                                     \
-    SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX);                          \
     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
@@ -478,11 +453,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         if (!high_bit_depth) {
-            SET_QPEL_FUNCS123 (put_h264_qpel, 0, 16, mmxext, );
-            SET_QPEL_FUNCS123 (put_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS_1PP(put_h264_qpel, 2,  4, mmxext, );
-            SET_QPEL_FUNCS123 (avg_h264_qpel, 0, 16, mmxext, );
-            SET_QPEL_FUNCS0123(avg_h264_qpel, 1,  8, mmxext, );
+            c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_mmxext;
             SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
         } else if (bit_depth == 10) {
             SET_QPEL_FUNCS(put_h264_qpel, 2, 4,  10_mmxext, ff_);
diff --git a/libavcodec/x86/h264_qpel_8bit.asm b/libavcodec/x86/h264_qpel_8bit.asm
index 4e64329991..89e7c282b2 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -96,66 +96,6 @@ INIT_MMX mmxext
 QPEL4_H_LOWPASS_OP put
 QPEL4_H_LOWPASS_OP avg
 
-%macro QPEL8_H_LOWPASS_OP 1
-cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
-    movsxdifnidn  r2, r2d
-    movsxdifnidn  r3, r3d
-    mov          r4d, 8
-    pxor          m7, m7
-    mova          m6, [pw_5]
-.loop:
-    mova          m0, [r1]
-    mova          m2, [r1+1]
-    mova          m1, m0
-    mova          m3, m2
-    punpcklbw     m0, m7
-    punpckhbw     m1, m7
-    punpcklbw     m2, m7
-    punpckhbw     m3, m7
-    paddw         m0, m2
-    paddw         m1, m3
-    psllw         m0, 2
-    psllw         m1, 2
-    mova          m2, [r1-1]
-    mova          m4, [r1+2]
-    mova          m3, m2
-    mova          m5, m4
-    punpcklbw     m2, m7
-    punpckhbw     m3, m7
-    punpcklbw     m4, m7
-    punpckhbw     m5, m7
-    paddw         m2, m4
-    paddw         m5, m3
-    psubw         m0, m2
-    psubw         m1, m5
-    pmullw        m0, m6
-    pmullw        m1, m6
-    movd          m2, [r1-2]
-    movd          m5, [r1+7]
-    punpcklbw     m2, m7
-    punpcklbw     m5, m7
-    paddw         m2, m3
-    paddw         m4, m5
-    mova          m5, [pw_16]
-    paddw         m2, m5
-    paddw         m4, m5
-    paddw         m0, m2
-    paddw         m1, m4
-    psraw         m0, 5
-    psraw         m1, 5
-    packuswb      m0, m1
-    op_%1         m0, [r0], m4
-    add           r0, r2
-    add           r1, r3
-    dec          r4d
-    jg         .loop
-    RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8_H_LOWPASS_OP put
-QPEL8_H_LOWPASS_OP avg
-
 %macro QPEL8_H_LOWPASS_OP_XMM 1
 cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
     movsxdifnidn  r2, r2d

commit 4fc05c28f426d6073e6e15db334b0c88ff925f1d
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 13:12:31 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:35 2025 +0200

    avfilter/x86/vf_gradfun: Remove MMXEXT func overridden by SSSE3
    
    SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
    so that the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 version
    of filter_line.
    This commit therefore removes the overridden MMXEXT version
    (which didn't abide by the ABI) which allows us to remove
    an emms_c() from vf_gradfun.c, so that users with SSSE3 no longer
    pay a price for the mere existence of an MMXEXT version.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c
index 088b3c9143..4f211c3ddf 100644
--- a/libavfilter/vf_gradfun.c
+++ b/libavfilter/vf_gradfun.c
@@ -32,7 +32,6 @@
  * Dither it back to 8bit.
  */
 
-#include "libavutil/emms.h"
 #include "libavutil/imgutils.h"
 #include "libavutil/common.h"
 #include "libavutil/mem.h"
@@ -119,7 +118,6 @@ static void filter(GradFunContext *ctx, uint8_t *dst, const uint8_t *src, int wi
         ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc - r / 2, width, thresh, dither[y & 7]);
         if (++y >= height) break;
     }
-    emms_c();
 }
 
 static av_cold int init(AVFilterContext *ctx)
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index d106d52100..55e7c1ea0f 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -27,7 +27,15 @@ pw_ff: times 8 dw 0xFF
 
 SECTION .text
 
-%macro FILTER_LINE 1
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8
+    movd       m5, r4d
+    pxor       m7, m7
+    pshuflw    m5, m5, 0
+    mova       m6, [pw_7f]
+    punpcklqdq m5, m5
+    mova       m4, [r5]
+.loop:
     movh       m0, [r2+r0]
     movh       m1, [r3+r0]
     punpcklbw  m0, m7
@@ -40,42 +48,12 @@ SECTION .text
     pminsw     m2, m7
     pmullw     m2, m2
     psllw      m1, 2
-    paddw      m0, %1
+    paddw      m0, m4
     pmulhw     m1, m2
     paddw      m0, m1
     psraw      m0, 7
     packuswb   m0, m0
     movh  [r1+r0], m0
-%endmacro
-
-INIT_MMX mmxext
-cglobal gradfun_filter_line, 6, 6
-    movh      m5, r4d
-    pxor      m7, m7
-    pshufw    m5, m5,0
-    mova      m6, [pw_7f]
-    mova      m3, [r5]
-    mova      m4, [r5+8]
-.loop:
-    FILTER_LINE m3
-    add       r0, 4
-    jge .end
-    FILTER_LINE m4
-    add       r0, 4
-    jl .loop
-.end:
-    RET
-
-INIT_XMM ssse3
-cglobal gradfun_filter_line, 6, 6, 8
-    movd       m5, r4d
-    pxor       m7, m7
-    pshuflw    m5, m5, 0
-    mova       m6, [pw_7f]
-    punpcklqdq m5, m5
-    mova       m4, [r5]
-.loop:
-    FILTER_LINE m4
     add        r0, 8
     jl .loop
     RET
diff --git a/libavfilter/x86/vf_gradfun_init.c b/libavfilter/x86/vf_gradfun_init.c
index 56e6774a79..f262f0a1bb 100644
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
@@ -24,9 +24,6 @@
 #include "libavutil/x86/cpu.h"
 #include "libavfilter/gradfun.h"
 
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t *src,
-                                   const uint16_t *dc, int thresh,
-                                   const uint16_t *dithers);
 void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
                                   const uint16_t *dc, int thresh,
                                   const uint16_t *dithers);
@@ -39,23 +36,6 @@ void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t *buf,
                                       const uint8_t *src1, const uint8_t 
*src2);
 
 #if HAVE_X86ASM
-static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src,
-                                       const uint16_t *dc,
-                                       int width, int thresh,
-                                       const uint16_t *dithers)
-{
-    intptr_t x;
-    if (width & 3) {
-        x = width & ~3;
-        ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
-                                 width - x, thresh, dithers);
-        width = x;
-    }
-    x = -width;
-    ff_gradfun_filter_line_mmxext(x, dst + width, src + width, dc + width / 2,
-                                  thresh, dithers);
-}
-
 static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const uint16_t *dc,
                                       int width, int thresh,
                                       const uint16_t *dithers)
@@ -93,8 +73,6 @@ av_cold void ff_gradfun_init_x86(GradFunContext *gf)
 #if HAVE_X86ASM
     int cpu_flags = av_get_cpu_flags();
 
-    if (EXTERNAL_MMXEXT(cpu_flags))
-        gf->filter_line = gradfun_filter_line_mmxext;
     if (EXTERNAL_SSSE3(cpu_flags))
         gf->filter_line = gradfun_filter_line_ssse3;
 

commit 5ef613bcb0508f16bd5b190168183326391de9b0
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 06:22:05 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:31 2025 +0200

    avcodec/x86/mpegvideoencdsp_init: Remove MMX, 3DNOW funcs overridden by SSSE3
    
    SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
    so that the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 versions.
    This commit therefore removes the MMX and 3DNOW functions overridden
    by them (which don't abide by the ABI) to get closer to a removal
    of emms_c.
    
    Also merge the mpegvideoenc_qns_template.c file into the main file.
    
    The 3DNOW functions removed in this commit were the last in the
    codebase.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c b/libavcodec/x86/mpegvideoenc_qns_template.c
deleted file mode 100644
index 0d6454f45f..0000000000
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
- * Copyright (c) 2004 Michael Niedermayer
- *
- * MMX optimization by Michael Niedermayer <[email protected]>
- * 3DNow! and SSSE3 optimization by Zuxy Meng <[email protected]>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/avassert.h"
-#include "libavutil/common.h"
-#include "libavutil/x86/asm.h"
-
-#include "inline_asm.h"
-
-#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
-
-static int DEF(try_8x8basis)(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
-{
-    x86_reg i=0;
-
-    av_assert2(FFABS(scale) < MAX_ABS);
-    scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
-
-    SET_RND(mm6);
-    __asm__ volatile(
-        "pxor %%mm7, %%mm7              \n\t"
-        "movd  %4, %%mm5                \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
-        "punpcklwd %%mm5, %%mm5         \n\t"
-        ".p2align 4                     \n\t"
-        "1:                             \n\t"
-        "movq  (%1, %0), %%mm0          \n\t"
-        "movq  8(%1, %0), %%mm1         \n\t"
-        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-        "paddw (%2, %0), %%mm0          \n\t"
-        "paddw 8(%2, %0), %%mm1         \n\t"
-        "psraw $6, %%mm0                \n\t"
-        "psraw $6, %%mm1                \n\t"
-        "pmullw (%3, %0), %%mm0         \n\t"
-        "pmullw 8(%3, %0), %%mm1        \n\t"
-        "pmaddwd %%mm0, %%mm0           \n\t"
-        "pmaddwd %%mm1, %%mm1           \n\t"
-        "paddd %%mm1, %%mm0             \n\t"
-        "psrld $4, %%mm0                \n\t"
-        "paddd %%mm0, %%mm7             \n\t"
-        "add $16, %0                    \n\t"
-        "cmp $128, %0                   \n\t" //FIXME optimize & bench
-        " jb 1b                         \n\t"
-        PHADDD(%%mm7, %%mm6)
-        "psrld $2, %%mm7                \n\t"
-        "movd %%mm7, %0                 \n\t"
-
-        : "+r" (i)
-        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
-    );
-    return i;
-}
-
-static void DEF(add_8x8basis)(int16_t rem[64], const int16_t basis[64], int scale)
-{
-    x86_reg i=0;
-
-    if(FFABS(scale) < MAX_ABS){
-        scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
-        SET_RND(mm6);
-        __asm__ volatile(
-                "movd  %3, %%mm5        \n\t"
-                "punpcklwd %%mm5, %%mm5 \n\t"
-                "punpcklwd %%mm5, %%mm5 \n\t"
-                ".p2align 4             \n\t"
-                "1:                     \n\t"
-                "movq  (%1, %0), %%mm0  \n\t"
-                "movq  8(%1, %0), %%mm1 \n\t"
-                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
-                "paddw (%2, %0), %%mm0  \n\t"
-                "paddw 8(%2, %0), %%mm1 \n\t"
-                "movq %%mm0, (%2, %0)   \n\t"
-                "movq %%mm1, 8(%2, %0)  \n\t"
-                "add $16, %0            \n\t"
-                "cmp $128, %0           \n\t" // FIXME optimize & bench
-                " jb 1b                 \n\t"
-
-                : "+r" (i)
-                : "r"(basis), "r"(rem), "g"(scale)
-        );
-    }else{
-        for(i=0; i<8*8; i++){
-            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
-        }
-    }
-}
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c b/libavcodec/x86/mpegvideoencdsp_init.c
index d39091a5c9..78c2ef87b8 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -16,9 +16,13 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <stdint.h>
+
 #include "libavutil/attributes.h"
 #include "libavutil/avassert.h"
+#include "libavutil/common.h"
 #include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
 #include "libavutil/x86/cpu.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mpegvideoencdsp.h"
@@ -28,71 +32,93 @@ int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);
 int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
 
 #if HAVE_INLINE_ASM
-
-#define PHADDD(a, t)                            \
-    "movq  " #a ", " #t "               \n\t"   \
-    "psrlq    $32, " #a "               \n\t"   \
-    "paddd " #t ", " #a "               \n\t"
+#if HAVE_SSSE3_INLINE
+#define SCALE_OFFSET -1
 
 /*
- * pmulhw:   dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
- * pmulhrw:  dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
  * pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
  */
-#define PMULHRW(x, y, s, o)                     \
-    "pmulhw " #s ", " #x "              \n\t"   \
-    "pmulhw " #s ", " #y "              \n\t"   \
-    "paddw  " #o ", " #x "              \n\t"   \
-    "paddw  " #o ", " #y "              \n\t"   \
-    "psraw      $1, " #x "              \n\t"   \
-    "psraw      $1, " #y "              \n\t"
-#define DEF(x) x ## _mmx
-#define SET_RND MOVQ_WONE
-#define SCALE_OFFSET 1
-
-#include "mpegvideoenc_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#define DEF(x) x ## _3dnow
-#define SET_RND(x)
-#define SCALE_OFFSET 0
-#define PMULHRW(x, y, s, o)                     \
-    "pmulhrw " #s ", " #x "             \n\t"   \
-    "pmulhrw " #s ", " #y "             \n\t"
-
-#include "mpegvideoenc_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#if HAVE_SSSE3_INLINE
-#undef PHADDD
-#define DEF(x) x ## _ssse3
-#define SET_RND(x)
-#define SCALE_OFFSET -1
-
-#define PHADDD(a, t)                            \
-    "pshufw $0x0E, " #a ", " #t "       \n\t"   \
-    /* faster than phaddd on core2 */           \
-    "paddd " #t ", " #a "               \n\t"
-
 #define PMULHRW(x, y, s, o)                     \
     "pmulhrsw " #s ", " #x "            \n\t"   \
     "pmulhrsw " #s ", " #y "            \n\t"
 
-#include "mpegvideoenc_qns_template.c"
+#define MAX_ABS 512
+
+static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)
+{
+    x86_reg i=0;
+
+    av_assert2(FFABS(scale) < MAX_ABS);
+    scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+    __asm__ volatile(
+        "pxor %%mm7, %%mm7              \n\t"
+        "movd  %4, %%mm5                \n\t"
+        "punpcklwd %%mm5, %%mm5         \n\t"
+        "punpcklwd %%mm5, %%mm5         \n\t"
+        ".p2align 4                     \n\t"
+        "1:                             \n\t"
+        "movq  (%1, %0), %%mm0          \n\t"
+        "movq  8(%1, %0), %%mm1         \n\t"
+        PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+        "paddw (%2, %0), %%mm0          \n\t"
+        "paddw 8(%2, %0), %%mm1         \n\t"
+        "psraw $6, %%mm0                \n\t"
+        "psraw $6, %%mm1                \n\t"
+        "pmullw (%3, %0), %%mm0         \n\t"
+        "pmullw 8(%3, %0), %%mm1        \n\t"
+        "pmaddwd %%mm0, %%mm0           \n\t"
+        "pmaddwd %%mm1, %%mm1           \n\t"
+        "paddd %%mm1, %%mm0             \n\t"
+        "psrld $4, %%mm0                \n\t"
+        "paddd %%mm0, %%mm7             \n\t"
+        "add $16, %0                    \n\t"
+        "cmp $128, %0                   \n\t" //FIXME optimize & bench
+        " jb 1b                         \n\t"
+        "pshufw $0x0E, %%mm7, %%mm6     \n\t"
+        "paddd %%mm6, %%mm7             \n\t" // faster than phaddd on core2
+        "psrld $2, %%mm7                \n\t"
+        "movd %%mm7, %0                 \n\t"
+
+        : "+r" (i)
+        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+    );
+    return i;
+}
+
+static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale)
+{
+    x86_reg i=0;
+
+    if (FFABS(scale) < MAX_ABS) {
+        scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+        __asm__ volatile(
+                "movd  %3, %%mm5        \n\t"
+                "punpcklwd %%mm5, %%mm5 \n\t"
+                "punpcklwd %%mm5, %%mm5 \n\t"
+                ".p2align 4             \n\t"
+                "1:                     \n\t"
+                "movq  (%1, %0), %%mm0  \n\t"
+                "movq  8(%1, %0), %%mm1 \n\t"
+                PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+                "paddw (%2, %0), %%mm0  \n\t"
+                "paddw 8(%2, %0), %%mm1 \n\t"
+                "movq %%mm0, (%2, %0)   \n\t"
+                "movq %%mm1, 8(%2, %0)  \n\t"
+                "add $16, %0            \n\t"
+                "cmp $128, %0           \n\t" // FIXME optimize & bench
+                " jb 1b                 \n\t"
+
+                : "+r" (i)
+                : "r"(basis), "r"(rem), "g"(scale)
+        );
+    } else {
+        for (i=0; i<8*8; i++) {
+            rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+        }
+    }
+}
 
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-#undef PHADDD
 #endif /* HAVE_SSSE3_INLINE */
 
 /* Draw the edges of width 'w' of an image of size width, height */
@@ -197,23 +223,11 @@ av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
 #if HAVE_INLINE_ASM
 
     if (INLINE_MMX(cpu_flags)) {
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->try_8x8basis = try_8x8basis_mmx;
-        }
-        c->add_8x8basis = add_8x8basis_mmx;
-
         if (avctx->bits_per_raw_sample <= 8) {
             c->draw_edges = draw_edges_mmx;
         }
     }
 
-    if (INLINE_AMD3DNOW(cpu_flags)) {
-        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
-            c->try_8x8basis = try_8x8basis_3dnow;
-        }
-        c->add_8x8basis = add_8x8basis_3dnow;
-    }
-
 #if HAVE_SSSE3_INLINE
     if (INLINE_SSSE3(cpu_flags)) {
         if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {

commit 6a47ea5f9fdaedd6aa4bc8723c86a0c7a30d8ed1
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 16:17:53 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:26 2025 +0200

    avcodec/x86/vvc/sao_10bit: Remove unused functions
    
    Saves 65280B here.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/vvc/sao_10bit.asm b/libavcodec/x86/vvc/sao_10bit.asm
index b7d3d08008..ccf14a34a4 100644
--- a/libavcodec/x86/vvc/sao_10bit.asm
+++ b/libavcodec/x86/vvc/sao_10bit.asm
@@ -28,28 +28,6 @@
     H2656_SAO_BAND_FILTER vvc, %1, %2, %3
 %endmacro
 
-%macro VVC_SAO_BAND_FILTER_FUNCS 1
-    VVC_SAO_BAND_FILTER %1,   8,  1
-    VVC_SAO_BAND_FILTER %1,  16,  2
-    VVC_SAO_BAND_FILTER %1,  32,  4
-    VVC_SAO_BAND_FILTER %1,  48,  6
-    VVC_SAO_BAND_FILTER %1,  64,  8
-    VVC_SAO_BAND_FILTER %1,  80, 10
-    VVC_SAO_BAND_FILTER %1,  96, 12
-    VVC_SAO_BAND_FILTER %1, 112, 14
-    VVC_SAO_BAND_FILTER %1, 128, 16
-%endmacro
-
-%macro VVC_SAO_BAND_FILTER_FUNCS 0
-    VVC_SAO_BAND_FILTER_FUNCS 10
-    VVC_SAO_BAND_FILTER_FUNCS 12
-%endmacro
-
-INIT_XMM sse2
-VVC_SAO_BAND_FILTER_FUNCS
-INIT_XMM avx
-VVC_SAO_BAND_FILTER_FUNCS
-
 %if HAVE_AVX2_EXTERNAL
 
 %macro VVC_SAO_BAND_FILTER_FUNCS_AVX2 1
@@ -75,22 +53,6 @@ VVC_SAO_BAND_FILTER_FUNCS_AVX2 12
     H2656_SAO_EDGE_FILTER vvc, %1, %2, %3
 %endmacro
 
-%macro VVC_SAO_EDGE_FILTER_FUNCS 1
-    VVC_SAO_EDGE_FILTER %1,   8,  1
-    VVC_SAO_EDGE_FILTER %1,  16,  2
-    VVC_SAO_EDGE_FILTER %1,  32,  4
-    VVC_SAO_EDGE_FILTER %1,  48,  6
-    VVC_SAO_EDGE_FILTER %1,  64,  8
-    VVC_SAO_EDGE_FILTER %1,  80, 10
-    VVC_SAO_EDGE_FILTER %1,  96, 12
-    VVC_SAO_EDGE_FILTER %1, 112, 14
-    VVC_SAO_EDGE_FILTER %1, 128, 16
-%endmacro
-
-INIT_XMM sse2
-VVC_SAO_EDGE_FILTER_FUNCS 10
-VVC_SAO_EDGE_FILTER_FUNCS 12
-
 %if HAVE_AVX2_EXTERNAL
 
 %macro VVC_SAO_EDGE_FILTER_FUNCS_AVX2 1

commit 918d37d9d156f15b63952a22bfba0541dd087129
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 05:55:07 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:23 2025 +0200

    avcodec/x86/rv40dsp_init: Remove MMX(EXT) funcs overridden by SSSE3
    
    SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
    so that the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 versions.
    This commit therefore removes the MMX(EXT) functions overridden
    by them (which don't abide by the ABI) to get closer to a removal
    of emms_c.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 9d8b58f929..859894856d 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -372,11 +372,7 @@ AVG_PIXELS8_Y2
 
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro SET_PIXELS_XY2 1
-%if cpuflag(sse2)
 cglobal %1_pixels16_xy2, 4,5,8
-%else
-cglobal %1_pixels8_xy2, 4,5
-%endif
     pxor        m7, m7
     mova        m6, [pw_2]
     movu        m0, [r1]
@@ -448,8 +444,6 @@ cglobal %1_pixels8_xy2, 4,5
     RET
 %endmacro
 
-INIT_MMX mmxext
-SET_PIXELS_XY2 avg
 INIT_XMM sse2
 SET_PIXELS_XY2 put
 SET_PIXELS_XY2 avg
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index ac7e625fda..8208e43ac1 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -25,22 +25,14 @@
 void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 
-void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 
-void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                             ptrdiff_t line_size, int h);
 void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h);
 
-void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int h);
 void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 7ee2db1358..ab32b825c9 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -301,20 +301,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
 CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
 CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #endif
-
-/***********************************/
-/* MMX rounding */
-
-#define SET_RND  MOVQ_WTWO
-#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
-#define STATIC
-
-#include "rnd_template.c"
-
-#undef NO_AVG
-#undef DEF
-#undef SET_RND
-
 #endif /* HAVE_INLINE_ASM */
 
 
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index ab9e644c60..780358abc2 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -174,34 +174,22 @@ DEFINE_FN(put, 8, ssse3)
 DEFINE_FN(put, 16, sse2)
 DEFINE_FN(put, 16, ssse3)
 
-DEFINE_FN(avg, 8, mmxext)
 DEFINE_FN(avg, 8, ssse3)
 
 DEFINE_FN(avg, 16, sse2)
 DEFINE_FN(avg, 16, ssse3)
 #endif /* HAVE_X86ASM */
 
-#if HAVE_MMX_INLINE
-DEFINE_FN(put, 8, mmx)
-#endif
-
 av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
 {
     av_unused int cpu_flags = av_get_cpu_flags();
 
-#if HAVE_MMX_INLINE
-    if (INLINE_MMX(cpu_flags)) {
-        c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
-    }
-#endif /* HAVE_MMX_INLINE */
-
 #if HAVE_X86ASM
     if (EXTERNAL_MMX(cpu_flags)) {
         c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
         c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
     }
     if (EXTERNAL_MMXEXT(cpu_flags)) {
-        c->avg_pixels_tab[1][15]        = avg_rv40_qpel8_mc33_mmxext;
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
     }

commit e86f137514fb8a69cf145f26c83b1b053c727b52
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 05:28:17 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:19 2025 +0200

    avcodec/x86/hpeldsp_init: Remove MMX(EXT) funcs overridden by SSSE3
    
    SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
    so that the overwhelming majority of our users (particularly those
    that actually update their FFmpeg) will be using the SSSE3 versions.
    This commit therefore removes the MMX(EXT) functions overridden
    by them (which don't abide by the ABI) to get closer to a removal
    of emms_c.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index b59195de95..9d8b58f929 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -370,46 +370,6 @@ INIT_XMM sse2
 AVG_PIXELS8_Y2
 
 
-; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-; Note this is not correctly rounded, and is therefore used for
-; not-bitexact output
-INIT_MMX mmxext
-cglobal avg_approx_pixels8_xy2, 4,5
-    mova         m6, [pb_1]
-    lea          r4, [r2*2]
-    mova         m0, [r1]
-    PAVGB        m0, [r1+1]
-.loop:
-    mova         m2, [r1+r4]
-    mova         m1, [r1+r2]
-    psubusb      m2, m6
-    PAVGB        m1, [r1+r2+1]
-    PAVGB        m2, [r1+r4+1]
-    add          r1, r4
-    PAVGB        m0, m1
-    PAVGB        m1, m2
-    PAVGB        m0, [r0]
-    PAVGB        m1, [r0+r2]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova         m1, [r1+r2]
-    mova         m0, [r1+r4]
-    PAVGB        m1, [r1+r2+1]
-    PAVGB        m0, [r1+r4+1]
-    add          r0, r4
-    add          r1, r4
-    PAVGB        m2, m1
-    PAVGB        m1, m0
-    PAVGB        m2, [r0]
-    PAVGB        m1, [r0+r2]
-    mova       [r0], m2
-    mova    [r0+r2], m1
-    add          r0, r4
-    sub         r3d, 4
-    jne .loop
-    RET
-
-
 ; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 %macro SET_PIXELS_XY2 1
 %if cpuflag(sse2)
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c0913552d5..7ee2db1358 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -60,11 +60,7 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
-void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
-                                      ptrdiff_t line_size, int h);
 
-#define put_pixels8_mmx         ff_put_pixels8_mmx
-#define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
 #define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
 
 #if HAVE_INLINE_ASM
@@ -354,7 +350,9 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
     SET_HPEL_FUNCS12(avg_no_rnd,  , 16, mmx);
     c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
-    SET_HPEL_FUNCS03(put,      [1],  8, mmx);
+#if HAVE_MMX_EXTERNAL
+    c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
+#endif
     SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
 #endif
 }
@@ -368,7 +366,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
     c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
     c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
     c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
-    c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
 
     c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
     c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
@@ -378,8 +375,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
         c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
-
-        c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }

commit 2cf9e733c6a666600423a0967f23341d9f09e3c8
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 02:08:03 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:16 2025 +0200

    avcodec/x86/qpeldsp_init: Use SSE2 versions where possible
    
    The mc00 versions (i.e. the qdsp functions with no subpixel
    interpolation) are just wrappers around their fpel versions.
    There are SSE2 versions of these, yet the qpel code only
    uses the MMX(EXT) versions. This commit changes this and
    also removes the MMX(EXT) versions.
    
    This also allowed removing ff_avg_pixels16_mmxext and
    ff_put_pixels16_mmx.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b07b789074..8551ff1ff3 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -67,12 +67,10 @@ cglobal %1_pixels%2, 4,5,4
 
 INIT_MMX mmx
 OP_PIXELS put, 8
-OP_PIXELS put, 16
 
 INIT_MMX mmxext
 OP_PIXELS avg, 4
 OP_PIXELS avg, 8
-OP_PIXELS avg, 16
 
 INIT_XMM sse2
 OP_PIXELS put, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 47ffc8eec7..851a70b99f 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -26,14 +26,10 @@ void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h);
-void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
-                            ptrdiff_t line_size, int h);
 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
 void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h);
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
-                         ptrdiff_t line_size, int h);
 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h);
 
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index 3b05e156cc..097cda0106 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -79,22 +79,10 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, const uint8_t *src,
 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
 
 #if HAVE_X86ASM
 
-#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
-#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
-
 #define QPEL_OP(OPNAME, RND, MMX)                                       \
-static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst,                  \
-                                         const uint8_t *src,            \
-                                         ptrdiff_t stride)              \
-{                                                                       \
-    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
-}                                                                       \
-                                                                        \
 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst,                  \
                                          const uint8_t *src,            \
                                          ptrdiff_t stride)              \
@@ -291,13 +279,6 @@ static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst,                  \
                                                    stride, 8);          \
 }                                                                       \
                                                                         \
-static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst,                 \
-                                          const uint8_t *src,           \
-                                          ptrdiff_t stride)             \
-{                                                                       \
-    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
-}                                                                       \
-                                                                        \
 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst,                 \
                                           const uint8_t *src,           \
                                           ptrdiff_t stride)             \
@@ -504,11 +485,23 @@ QPEL_OP(put_,        _,        mmxext)
 QPEL_OP(avg_,        _,        mmxext)
 QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
 
+#define MC00(OPNAME, SIZE, EXT)                                         \
+static void OPNAME ## _qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst,      \
+                                                     const uint8_t *src,\
+                                                     ptrdiff_t stride)  \
+{                                                                       \
+    ff_ ## OPNAME ## _pixels ## SIZE ##_ ## EXT(dst, src, stride, SIZE);\
+}
+
+MC00(put,  8, mmx)
+MC00(avg,  8, mmxext)
+MC00(put, 16, sse2)
+MC00(avg, 16, sse2)
+
 #endif /* HAVE_X86ASM */
 
 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
 do {                                                                         \
-    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
@@ -533,12 +526,20 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
     if (X86_MMXEXT(cpu_flags)) {
 #if HAVE_MMXEXT_EXTERNAL
         SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
+        c->avg_qpel_pixels_tab[1][0] = avg_qpel8_mc00_mmxext;
         SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
 
         SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
+        c->put_no_rnd_qpel_pixels_tab[1][0] =
+        c->put_qpel_pixels_tab[1][0] = put_qpel8_mc00_mmx;
         SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
         SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
         SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 #endif /* HAVE_MMXEXT_EXTERNAL */
     }
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        c->put_no_rnd_qpel_pixels_tab[0][0] =
+        c->put_qpel_pixels_tab[0][0] = put_qpel16_mc00_sse2;
+        c->avg_qpel_pixels_tab[0][0] = avg_qpel16_mc00_sse2;
+    }
 }

commit 1f9ef6a8dc6e57b360cf53dd644fde1936ad3047
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 01:18:54 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:12 2025 +0200

    avcodec/x86/h264_qpel: Remove MMX(EXT) functions overridden by SSE2FAST
    
    CPUs which support SSE2, but not in a fast way (so that
    they get the additional AV_CPU_FLAG_SSE2SLOW) are ancient
    nowadays (2007 and older), so ignore the distinction between
    the two and remove MMX and MMXEXT functions that are now
    overridden by SSE2 functions.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index d69ccda89c..69ffd001e0 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -46,7 +46,6 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t *src1, const uint8_t
 #define ff_avg_pixels8_l2_sse2  ff_avg_pixels8_l2_mmxext
 #define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
 #define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-#define ff_put_pixels16_mmxext  ff_put_pixels16_mmx
 #define ff_put_pixels8_mmxext(...)
 #define ff_put_pixels4_mmxext(...)
 
@@ -217,7 +216,6 @@ static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const uint8_t *src,
 {
     ff_avg_pixels16_sse2(dst, src, stride, 16);
 }
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
 
 #define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
 static void av_unused OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, const uint8_t *src, ptrdiff_t stride)\
@@ -359,7 +357,7 @@ QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 
 H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
 H264_MC(H264_MC_C_H, 8, mmxext, 8)
-H264_MC(H264_MC_C_H, 16, mmxext, 8)
+H264_MC(H264_MC_H, 16, mmxext, 8)
 H264_MC_816(H264_MC_V, sse2)
 H264_MC_816(H264_MC_HV, sse2)
 H264_MC_816(H264_MC_H, ssse3)
@@ -480,10 +478,10 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
 
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         if (!high_bit_depth) {
-            SET_QPEL_FUNCS0123(put_h264_qpel, 0, 16, mmxext, );
+            SET_QPEL_FUNCS123 (put_h264_qpel, 0, 16, mmxext, );
             SET_QPEL_FUNCS123 (put_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS_1PP(put_h264_qpel, 2,  4, mmxext, );
-            SET_QPEL_FUNCS0123(avg_h264_qpel, 0, 16, mmxext, );
+            SET_QPEL_FUNCS123 (avg_h264_qpel, 0, 16, mmxext, );
             SET_QPEL_FUNCS0123(avg_h264_qpel, 1,  8, mmxext, );
             SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmxext, );
         } else if (bit_depth == 10) {
@@ -506,6 +504,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
             H264_QPEL_FUNCS(3, 1, sse2);
             H264_QPEL_FUNCS(3, 2, sse2);
             H264_QPEL_FUNCS(3, 3, sse2);
+            c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
+            c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
         }
 
         if (bit_depth == 10) {
@@ -519,14 +519,6 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
         }
     }
 
-    if (EXTERNAL_SSE2_FAST(cpu_flags)) {
-        if (!high_bit_depth) {
-            c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
-            c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
-            c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_sse2;
-        }
-    }
-
     if (EXTERNAL_SSSE3(cpu_flags)) {
         if (!high_bit_depth) {
             H264_QPEL_FUNCS(1, 0, ssse3);

commit 8a7858dacf50797c7b81aad119e8811a849d0552
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 00:26:32 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:08 2025 +0200

    avcodec/x86/hpeldsp_init: Remove MMX(EXT) functions overridden by SSE2FAST
    
    CPUs which support SSE2, but not in a fast way (so that
    they get the additional AV_CPU_FLAG_SSE2SLOW) are ancient
    nowadays (2007 and older), so ignore the distinction between
    the two and remove MMX and MMXEXT functions that are now
    overridden by SSE2 functions.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 3bc278618c..b59195de95 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -84,47 +84,7 @@ cglobal put_pixels8_x2, 4,5
 INIT_MMX mmxext
 PUT_PIXELS8_X2
 
-
 ; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-%macro PUT_PIXELS_16 0
-cglobal put_pixels16_x2, 4,5
-    lea          r4, [r2*2]
-.loop:
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    mova         m2, [r1+8]
-    mova         m3, [r1+r2+8]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
-    PAVGB        m2, [r1+9]
-    PAVGB        m3, [r1+r2+9]
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova     [r0+8], m2
-    mova  [r0+r2+8], m3
-    add          r1, r4
-    add          r0, r4
-    mova         m0, [r1]
-    mova         m1, [r1+r2]
-    mova         m2, [r1+8]
-    mova         m3, [r1+r2+8]
-    PAVGB        m0, [r1+1]
-    PAVGB        m1, [r1+r2+1]
-    PAVGB        m2, [r1+9]
-    PAVGB        m3, [r1+r2+9]
-    add          r1, r4
-    mova       [r0], m0
-    mova    [r0+r2], m1
-    mova     [r0+8], m2
-    mova  [r0+r2+8], m3
-    add          r0, r4
-    sub         r3d, 4
-    jne .loop
-    RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_PIXELS_16
 ; The 8_X2 macro can easily be used here
 INIT_XMM sse2
 PUT_PIXELS8_X2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c190e7b473..c0913552d5 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -36,8 +36,6 @@
 
 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
                               ptrdiff_t line_size, int h);
-void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
-                               ptrdiff_t line_size, int h);
 void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h);
 void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
@@ -66,10 +64,8 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h);
 
 #define put_pixels8_mmx         ff_put_pixels8_mmx
-#define put_pixels16_mmx        ff_put_pixels16_mmx
 #define put_pixels8_xy2_mmx     ff_put_pixels8_xy2_mmx
 #define put_no_rnd_pixels8_mmx  ff_put_pixels8_mmx
-#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
 
 #if HAVE_INLINE_ASM
 
@@ -323,10 +319,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
 #undef DEF
 #undef SET_RND
 
-#if HAVE_MMX
-CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
-#endif
-
 #endif /* HAVE_INLINE_ASM */
 
 
@@ -334,12 +326,7 @@ CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
 
 #define HPELDSP_AVG_PIXELS16(CPUEXT)                      \
     CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \
-    CALL_2X_PIXELS(put_pixels16_y2        ## CPUEXT, ff_put_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_x2        ## CPUEXT, ff_avg_pixels8_x2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_y2        ## CPUEXT, ff_avg_pixels8_y2        ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_pixels16_xy2       ## CPUEXT, ff_avg_pixels8_xy2       ## CPUEXT, 8) \
-    CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT, ff_avg_approx_pixels8_xy2## CPUEXT, 8)
+    CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8)
 
 HPELDSP_AVG_PIXELS16(_mmxext)
 
@@ -359,17 +346,12 @@ HPELDSP_AVG_PIXELS16(_mmxext)
         c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
         c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
     } while (0)
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                     \
-    do {                                                                        \
-        SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU);                                  \
-        SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU);                                  \
-    } while (0)
 
 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMX_INLINE
-    SET_HPEL_FUNCS03(put,      [0], 16, mmx);
-    SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
+    SET_HPEL_FUNCS12(put_no_rnd, [0], 16, mmx);
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
     SET_HPEL_FUNCS12(avg_no_rnd,  , 16, mmx);
     c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
     SET_HPEL_FUNCS03(put,      [1],  8, mmx);
@@ -380,14 +362,6 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
 {
 #if HAVE_MMXEXT_EXTERNAL
-    c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
-    c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
-
-    c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
-    c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
-    c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
-    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-
     c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
     c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
 
@@ -399,21 +373,18 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
     c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
     c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
 
-    c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_mmxext;
-
     if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
         c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
         c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
         c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
         c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
         c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
     }
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
-static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
+static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
 {
 #if HAVE_SSE2_EXTERNAL
     c->put_pixels_tab[0][0]        = ff_put_pixels16_sse2;
@@ -449,8 +420,8 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
     if (EXTERNAL_MMXEXT(cpu_flags))
         hpeldsp_init_mmxext(c, flags);
 
-    if (EXTERNAL_SSE2_FAST(cpu_flags))
-        hpeldsp_init_sse2_fast(c, flags);
+    if (EXTERNAL_SSE2(cpu_flags))
+        hpeldsp_init_sse2(c, flags);
 
     if (EXTERNAL_SSSE3(cpu_flags))
         hpeldsp_init_ssse3(c, flags);

commit 4d691da5edb360fa043df8ce267a382cfcdaf07a
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 05:24:49 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:04 2025 +0200

    avcodec/x86/hpeldsp_init: Remove MMX functions overridden by MMXEXT
    
    Forgotten in a51279bbdea0d6db920d71980262bccd0ce78226 because
    I only looked for MMX(EXT) functions overridden by SSE2.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 6b2ad4494b..c190e7b473 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -161,38 +161,6 @@ static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
         :FF_REG_a, "memory");
 }
 
-static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
-{
-    MOVQ_BFE(mm6);
-    __asm__ volatile(
-        "lea    (%3, %3), %%"FF_REG_a"  \n\t"
-        ".p2align 3                     \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   1(%1), %%mm1            \n\t"
-        "movq   (%1, %3), %%mm2         \n\t"
-        "movq   1(%1, %3), %%mm3        \n\t"
-        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
-        "movq   %%mm4, (%2)             \n\t"
-        "movq   %%mm5, (%2, %3)         \n\t"
-        "add    %%"FF_REG_a", %1        \n\t"
-        "add    %%"FF_REG_a", %2        \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r"((x86_reg)line_size)
-        :FF_REG_a, "memory");
-}
-
 static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
 {
     MOVQ_BFE(mm6);
@@ -405,7 +373,7 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
     SET_HPEL_FUNCS12(avg_no_rnd,  , 16, mmx);
     c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
     SET_HPEL_FUNCS03(put,      [1],  8, mmx);
-    SET_HPEL_FUNCS(put_no_rnd, [1],  8, mmx);
+    SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
 #endif
 }
 

commit 4e2ef29cbaa258cb73f06e62435198736e493a10
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 03:43:20 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:02 2025 +0200

    tests/checkasm: Add hpeldsp checkasm
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 0a54adc96a..c41d719e82 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -12,6 +12,7 @@ AVCODECOBJS-$(CONFIG_H264CHROMA)        += h264chroma.o
 AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
 AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
 AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
+AVCODECOBJS-$(CONFIG_HPELDSP)           += hpeldsp.o
 AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
 AVCODECOBJS-$(CONFIG_LLAUDDSP)          += llauddsp.o
 AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index ad4d9b53b6..b23e4ce889 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -184,6 +184,9 @@ static const struct {
         { "hevc_pel", checkasm_check_hevc_pel },
         { "hevc_sao", checkasm_check_hevc_sao },
     #endif
+    #if CONFIG_HPELDSP
+        { "hpeldsp", checkasm_check_hpeldsp },
+    #endif
     #if CONFIG_HUFFYUV_DECODER
         { "huffyuvdsp", checkasm_check_huffyuvdsp },
     #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 1684c427d6..0f02c4fb6d 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -110,6 +110,7 @@ void checkasm_check_hevc_deblock(void);
 void checkasm_check_hevc_idct(void);
 void checkasm_check_hevc_pel(void);
 void checkasm_check_hevc_sao(void);
+void checkasm_check_hpeldsp(void);
 void checkasm_check_huffyuvdsp(void);
 void checkasm_check_idctdsp(void);
 void checkasm_check_idet(void);
diff --git a/tests/checkasm/hpeldsp.c b/tests/checkasm/hpeldsp.c
new file mode 100644
index 0000000000..ba290b3ab8
--- /dev/null
+++ b/tests/checkasm/hpeldsp.c
@@ -0,0 +1,115 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "checkasm.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem_internal.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hpeldsp.h"
+
+#define MAX_BLOCK_SIZE 16
+#define MAX_HEIGHT     16
+#define MAX_STRIDE     64
+// BUF_SIZE is bigger than necessary in order to test strides > block width.
+#define BUF_SIZE ((MAX_HEIGHT - 1) * MAX_STRIDE + MAX_BLOCK_SIZE)
+// Due to hpel interpolation the input needs to have one more line than
+// the output and the last line needs one more element.
+// The input is not subject to alignment requirements; making the input buffer
+// bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment.
+#define INPUT_BUF_SIZE (MAX_HEIGHT * MAX_STRIDE + MAX_BLOCK_SIZE + 1 + (MAX_BLOCK_SIZE - 1))
+
+#define randomize_buffers(buf0, buf1)                      \
+    do {                                                   \
+        static_assert(sizeof(buf0) == sizeof(buf1), "Incompatible buffers"); \
+        static_assert(!(sizeof(buf0) % 4), "Tail handling needed"); \
+        static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
+                      "Pointer arithmetic needs to be adapted"); \
+        for (size_t k = 0; k < sizeof(buf0); k += 4) {     \
+            uint32_t r = rnd();                            \
+            AV_WN32A(buf0 + k, r);                         \
+            AV_WN32A(buf1 + k, r);                         \
+        }                                                  \
+    } while (0)
+
+
+void checkasm_check_hpeldsp(void)
+{
+    DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf0)[INPUT_BUF_SIZE];
+    DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf1)[INPUT_BUF_SIZE];
+    DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf0)[BUF_SIZE];
+    DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf1)[BUF_SIZE];
+    HpelDSPContext hdsp;
+    static const struct {
+        const char *name;
+        size_t offset;
+        unsigned nb_blocksizes;
+    } tests[] = {
+#define TEST(NAME, NB) { .name = #NAME, .offset = offsetof(HpelDSPContext, NAME), .nb_blocksizes = NB }
+        TEST(put_pixels_tab, 4),
+        TEST(avg_pixels_tab, 4),
+        TEST(put_no_rnd_pixels_tab, 2), // put_no_rnd_pixels_tab only has two usable blocksizes
+        TEST(avg_no_rnd_pixels_tab, 1),
+    };
+    declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int h);
+
+    ff_hpeldsp_init(&hdsp, AV_CODEC_FLAG_BITEXACT);
+
+    for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+        op_pixels_func (*func_tab)[4] = (op_pixels_func (*)[4])((char*)&hdsp + tests[i].offset);
+        for (unsigned j = 0; j < tests[i].nb_blocksizes; ++j) {
+            const unsigned blocksize = MAX_BLOCK_SIZE >> j;
+            // h must always be a multiple of four, except when width is two or four.
+            const unsigned h_mult = blocksize <= 4 ? 2 : 4;
+
+            for (unsigned dxy = 0; dxy < 4; ++dxy) {
+                if (check_func(func_tab[j][dxy], "%s[%u][%u]", tests[i].name, j, dxy)) {
+                    // Don't always use output that is 16-aligned.
+                    size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize)) * blocksize;
+                    size_t src_offset = rnd() % MAX_BLOCK_SIZE;
+                    ptrdiff_t stride  = (rnd() % (MAX_STRIDE / blocksize) + 1) * blocksize;
+                    int h = (rnd() % (MAX_HEIGHT / h_mult) + 1) * h_mult;
+                    const uint8_t *src0 = srcbuf0 + src_offset, *src1 = srcbuf1 + src_offset;
+                    uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 + dst_offset;
+
+                    if (rnd() & 1) {
+                        // Flip stride.
+                        dst1  += (h - 1) * stride;
+                        dst0  += (h - 1) * stride;
+                        // Due to interpolation potentially h + 1 lines are read
+                        // from src, hence h * stride.
+                        src0  += h * stride;
+                        src1  += h * stride;
+                        stride = -stride;
+                    }
+
+                    randomize_buffers(srcbuf0, srcbuf1);
+                    randomize_buffers(dstbuf0, dstbuf1);
+                    call_ref(dst0, src0, stride, h);
+                    call_new(dst1, src1, stride, h);
+                    if (memcmp(srcbuf0, srcbuf1, sizeof(srcbuf0)) || memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0)))
+                        fail();
+                    bench_new(dst0, src0, stride, h);
+                }
+            }
+        }
+    }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 56476d254c..7570c89ad9 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp                                 \
                 fate-checkasm-hevc_idct                                 \
                 fate-checkasm-hevc_pel                                  \
                 fate-checkasm-hevc_sao                                  \
+                fate-checkasm-hpeldsp                                   \
                 fate-checkasm-huffyuvdsp                                \
                 fate-checkasm-idctdsp                                   \
                 fate-checkasm-jpeg2000dsp                               \

commit fcb9e0b5f019ec46dffb6d769793ccb7d884fb14
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 06:11:43 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:56 2025 +0200

    avcodec/hpel{dsp,_template}: Use ptrdiff_t for strides
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index 77ebcd74a2..67bee665a9 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -29,9 +29,9 @@
 static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst,            \
                                                const uint8_t *src1,     \
                                                const uint8_t *src2,     \
-                                               int dst_stride,          \
-                                               int src_stride1,         \
-                                               int src_stride2,         \
+                                               ptrdiff_t dst_stride,    \
+                                               ptrdiff_t src_stride1,   \
+                                               ptrdiff_t src_stride2,   \
                                                int h)                   \
 {                                                                       \
     int i;                                                              \
@@ -50,9 +50,9 @@ static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst,            \
 static inline void FUNC(OPNAME ## _pixels4_l2)(uint8_t *dst,            \
                                                const uint8_t *src1,     \
                                                const uint8_t *src2,     \
-                                               int dst_stride,          \
-                                               int src_stride1,         \
-                                               int src_stride2,         \
+                                               ptrdiff_t dst_stride,    \
+                                               ptrdiff_t src_stride1,   \
+                                               ptrdiff_t src_stride2,   \
                                                int h)                   \
 {                                                                       \
     int i;                                                              \
@@ -67,9 +67,9 @@ static inline void FUNC(OPNAME ## _pixels4_l2)(uint8_t *dst,            \
 static inline void FUNC(OPNAME ## _pixels2_l2)(uint8_t *dst,            \
                                                const uint8_t *src1,     \
                                                const uint8_t *src2,     \
-                                               int dst_stride,          \
-                                               int src_stride1,         \
-                                               int src_stride2,         \
+                                               ptrdiff_t dst_stride,    \
+                                               ptrdiff_t src_stride1,   \
+                                               ptrdiff_t src_stride2,   \
                                                int h)                   \
 {                                                                       \
     int i;                                                              \
@@ -84,9 +84,9 @@ static inline void FUNC(OPNAME ## _pixels2_l2)(uint8_t *dst,            \
 static inline void FUNC(OPNAME ## _pixels16_l2)(uint8_t *dst,           \
                                                 const uint8_t *src1,    \
                                                 const uint8_t *src2,    \
-                                                int dst_stride,         \
-                                                int src_stride1,        \
-                                                int src_stride2,        \
+                                                ptrdiff_t dst_stride,   \
+                                                ptrdiff_t src_stride1,  \
+                                                ptrdiff_t src_stride2,  \
                                                 int h)                  \
 {                                                                       \
     FUNC(OPNAME ## _pixels8_l2)(dst, src1, src2, dst_stride,            \
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index db0e02ee93..688939ad3f 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -39,9 +39,9 @@
 static inline void OPNAME ## _no_rnd_pixels8_l2_8(uint8_t *dst,         \
                                                   const uint8_t *src1,  \
                                                   const uint8_t *src2,  \
-                                                  int dst_stride,       \
-                                                  int src_stride1,      \
-                                                  int src_stride2,      \
+                                                  ptrdiff_t dst_stride, \
+                                                  ptrdiff_t src_stride1,\
+                                                  ptrdiff_t src_stride2,\
                                                   int h)                \
 {                                                                       \
     int i;                                                              \

commit 89f2016ece77868cc1982ae104d56b25aaf519c3
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 05:34:37 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:52 2025 +0200

    avcodec/hpel_template: Fix unintentional usage of unsigned offsets
    
    The value of sizeof() is of type size_t which means that
    an expression like
    src1[i * src_stride1 + 4 * sizeof(pixel)]
    will use a very large offset if src_stride1 is sufficiently negative.
    It works in practice (because it is correct modulo SIZE_MAX + 1),
    but UBSan treats it as error:
    libavcodec/hpel_template.c:104:1: runtime error: addition of unsigned offset to 0x7ffdfa0391d8 overflowed to 0x7ffdfa0391cc
    Fix this by casting sizeof(pixel) to int.
    
    (This has been uncovered by a checkasm test for the hpeldsp
    which will be added in a later commit.)
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index fccfe7610f..77ebcd74a2 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -40,9 +40,9 @@ static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst,            \
         a = AV_RN4P(&src1[i * src_stride1]);                            \
         b = AV_RN4P(&src2[i * src_stride2]);                            \
         OP(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b));   \
-        a = AV_RN4P(&src1[i * src_stride1 + 4 * sizeof(pixel)]);        \
-        b = AV_RN4P(&src2[i * src_stride2 + 4 * sizeof(pixel)]);        \
-        OP(*((pixel4 *) &dst[i * dst_stride + 4 * sizeof(pixel)]),      \
+        a = AV_RN4P(&src1[i * src_stride1 + 4 * (int)sizeof(pixel)]);   \
+        b = AV_RN4P(&src2[i * src_stride2 + 4 * (int)sizeof(pixel)]);   \
+        OP(*((pixel4 *) &dst[i * dst_stride + 4 * (int)sizeof(pixel)]), \
            rnd_avg_pixel4(a, b));                                       \
     }                                                                   \
 }                                                                       \

commit b316a1bdd122ca1bcb43b20dbd6bc9c244f98cfe
Author:     Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 22:51:18 2025 +0200
Commit:     Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:30 2025 +0200

    avcodec/hpeldsp: Fix documentation
    
    This commit fixes two issues in the documentation:
    a) The documentation for {put,avg}_pixels_tab only mentions
    widths 16 and 8, although it explicitly mentions that there
    are four horizontal blocksizes. This part of the patch
    basically reverts e5771f4f37b67951485205e110f4da5e7e32ea74.
    b) The restrictions on height don't match the reality. While
    most users abide by it, some do not:
    i) vp56.c copies a 16x12 block.
    ii) indeo3 can copy an arbitrary multiple of four lines
    for block widths 4, 8 and 16.
    iii) SVQ3 can use luma block sizes 16x16, 8x16,
    16x8, 8x8, 4x8, 8x4 and 4x4 and the corresponding
    8x8, 4x8, 8x4, 4x4, 2x4, 4x2 and 2x2 chroma block sizes.
    
    This implies that for widths 2 and 4 height can be two
    and is guaranteed to be at least even. For all other widths,
    height can be a multiple of four.
    
    Furthermore, a comment for the SVQ3 blocksizes has been added.
    
    Reviewed-by: Lynne <[email protected]>
    Signed-off-by: Andreas Rheinhardt <[email protected]>

diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 41a46f0760..1f6a165bf6 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -31,11 +31,12 @@
 #include <stdint.h>
 #include <stddef.h>
 
-/* add and put pixel (decoding) */
-// blocksizes for hpel_pixels_func are 8x4,8x8 16x8 16x16
-// h for hpel_pixels_func is limited to {width/2, width} but never larger
-// than 16 and never smaller than 4
-typedef void (*op_pixels_func)(uint8_t *block /*align width (8 or 16)*/,
+/**
+ * Average and put pixel
+ * Widths can be 16, 8, 4 or 2. For widths 2 and 4, h is always a positive
+ * multiple of 2; otherwise, it is a positive multiple of 4.
+ */
+typedef void (*op_pixels_func)(uint8_t *block /* align width */,
                                const uint8_t *pixels /*align 1*/,
                                ptrdiff_t line_size, int h);
 
@@ -46,8 +47,8 @@ typedef struct HpelDSPContext {
     /**
      * Halfpel motion compensation with rounding (a+b+1)>>1.
      * this is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH 2->4xH 3->2xH ][ xhalfpel + 2*yhalfpel ]
      * @param block destination where the result is stored
      * @param pixels source
      * @param line_size number of bytes in a horizontal line of block
@@ -58,8 +59,8 @@ typedef struct HpelDSPContext {
     /**
      * Halfpel motion compensation with rounding (a+b+1)>>1.
      * This is an array[4][4] of motion compensation functions for 4
-     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
-     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+     * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br>
+     * *pixels_tab[ 0->16xH 1->8xH 2->4xH 3->2xH ][ xhalfpel + 2*yhalfpel ]
      * @param block destination into which the result is averaged (a+b+1)>>1
      * @param pixels source
      * @param line_size number of bytes in a horizontal line of block
@@ -85,7 +86,7 @@ typedef struct HpelDSPContext {
      * Halfpel motion compensation with no rounding (a+b)>>1.
      * this is an array[4] of motion compensation functions for 1
      * horizontal blocksize (16) and the 4 halfpel positions<br>
-     * *pixels_tab[0][ xhalfpel + 2*yhalfpel ]
+     * *pixels_tab[ xhalfpel + 2*yhalfpel ]
      * @param block destination into which the result is averaged (a+b)>>1
      * @param pixels source
      * @param line_size number of bytes in a horizontal line of block
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 4c4f3018c5..dfcfce77d3 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -504,6 +504,7 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int mode,
                               int dir, int avg)
 {
     int i, j, k, mx, my, dx, dy, x, y;
+    // 0->16x16,1->8x16,2->16x8,3->8x8,4->4x8,5->8x4,6->4x4
     const int part_width    = ((size & 5) == 4) ? 4 : 16 >> (size & 1);
     const int part_height   = 16 >> ((unsigned)(size + 1) / 3);
     const int extra_width   = (mode == PREDICT_MODE) ? -16 * 6 : 0;

-----------------------------------------------------------------------

Summary of changes:
 libavcodec/hpel_template.c                 |  30 +--
 libavcodec/hpeldsp.c                       |   6 +-
 libavcodec/hpeldsp.h                       |  27 +--
 libavcodec/svq3.c                          |   1 +
 libavcodec/x86/fpel.asm                    |   2 -
 libavcodec/x86/fpel.h                      |   4 -
 libavcodec/x86/h264_qpel.c                 |  46 +---
 libavcodec/x86/h264_qpel_8bit.asm          |  60 -----
 libavcodec/x86/hpeldsp.asm                 | 202 ++++++----------
 libavcodec/x86/hpeldsp.h                   |   8 -
 libavcodec/x86/hpeldsp_init.c              | 364 ++++++++---------------------
 libavcodec/x86/mpegvideoenc_qns_template.c | 109 ---------
 libavcodec/x86/mpegvideoencdsp_init.c      | 150 ++++++------
 libavcodec/x86/qpeldsp_init.c              |  41 ++--
 libavcodec/x86/rnd_template.c              |  98 --------
 libavcodec/x86/rv40dsp_init.c              |  12 -
 libavcodec/x86/vvc/sao_10bit.asm           |  38 ---
 libavfilter/vf_gradfun.c                   |   2 -
 libavfilter/x86/vf_gradfun.asm             |  42 +---
 libavfilter/x86/vf_gradfun_init.c          |  22 --
 tests/checkasm/Makefile                    |   1 +
 tests/checkasm/checkasm.c                  |   3 +
 tests/checkasm/checkasm.h                  |   1 +
 tests/checkasm/hpeldsp.c                   | 115 +++++++++
 tests/fate/checkasm.mak                    |   1 +
 25 files changed, 440 insertions(+), 945 deletions(-)
 delete mode 100644 libavcodec/x86/mpegvideoenc_qns_template.c
 delete mode 100644 libavcodec/x86/rnd_template.c
 create mode 100644 tests/checkasm/hpeldsp.c


hooks/post-receive
-- 

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] branch master updated. a54d6b1d91 avcodec/x86/rnd_template: Merge into hpeldsp_init.c

Reply via email to