The branch, master has been updated
via a54d6b1d91ba17f6e1316997dd5f0ced4cee8ee5 (commit)
via 43fe9554cc7998fbe0bae455c7b374e76a4d253f (commit)
via 00e046df132fd1751c50798334b985beec89661f (commit)
via 30c4007c65e1f73de5ce1b5eb459c71e0b21389a (commit)
via 1e677e696488d52068e83c669ae871caa7c34583 (commit)
via 262791b8d8c7a5e3df44c8784de192857e67d52f (commit)
via c7161befb4ae7d0f40e35676f52507e7de1c8b01 (commit)
via 4fc05c28f426d6073e6e15db334b0c88ff925f1d (commit)
via 5ef613bcb0508f16bd5b190168183326391de9b0 (commit)
via 6a47ea5f9fdaedd6aa4bc8723c86a0c7a30d8ed1 (commit)
via 918d37d9d156f15b63952a22bfba0541dd087129 (commit)
via e86f137514fb8a69cf145f26c83b1b053c727b52 (commit)
via 2cf9e733c6a666600423a0967f23341d9f09e3c8 (commit)
via 1f9ef6a8dc6e57b360cf53dd644fde1936ad3047 (commit)
via 8a7858dacf50797c7b81aad119e8811a849d0552 (commit)
via 4d691da5edb360fa043df8ce267a382cfcdaf07a (commit)
via 4e2ef29cbaa258cb73f06e62435198736e493a10 (commit)
via fcb9e0b5f019ec46dffb6d769793ccb7d884fb14 (commit)
via 89f2016ece77868cc1982ae104d56b25aaf519c3 (commit)
via b316a1bdd122ca1bcb43b20dbd6bc9c244f98cfe (commit)
from baace56169a8cea7b44d727bdf656110aace011d (commit)
- Log -----------------------------------------------------------------
commit a54d6b1d91ba17f6e1316997dd5f0ced4cee8ee5
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 05:01:41 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:58 2025 +0200
avcodec/x86/rnd_template: Merge into hpeldsp_init.c
rnd_template.c is now included exactly once, so it no longer needs to be a separate template file.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 66ed886ea9..cb47cb7752 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -33,6 +33,7 @@
#include "libavcodec/pixels.h"
#include "fpel.h"
#include "hpeldsp.h"
+#include "inline_asm.h"
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
@@ -73,15 +74,74 @@ void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t
*pixels,
/***********************************/
/* MMX no rounding */
-#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
-#define SET_RND MOVQ_WONE
-#define STATIC static
-#include "rnd_template.c"
+// put_pixels
+static void put_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h)
+{
+ MOVQ_ZERO(mm7);
+ MOVQ_WONE(mm6); // =1 for no_rnd version
+ __asm__ volatile(
+ "movq (%1), %%mm0 \n\t"
+ "movq 1(%1), %%mm4 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
+ "add %3, %1 \n\t"
+ ".p2align 3 \n\t"
+ "1: \n\t"
+ "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
+ "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpckhbw %%mm7, %%mm1 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "paddusw %%mm2, %%mm0 \n\t"
+ "paddusw %%mm3, %%mm1 \n\t"
+ "paddusw %%mm6, %%mm4 \n\t"
+ "paddusw %%mm6, %%mm5 \n\t"
+ "paddusw %%mm0, %%mm4 \n\t"
+ "paddusw %%mm1, %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm5 \n\t"
+ "packuswb %%mm5, %%mm4 \n\t"
+ "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
+
+ "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
+ "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq %%mm4, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpckhbw %%mm7, %%mm3 \n\t"
+ "punpckhbw %%mm7, %%mm5 \n\t"
+ "paddusw %%mm2, %%mm4 \n\t"
+ "paddusw %%mm3, %%mm5 \n\t"
+ "paddusw %%mm6, %%mm0 \n\t"
+ "paddusw %%mm6, %%mm1 \n\t"
+ "paddusw %%mm4, %%mm0 \n\t"
+ "paddusw %%mm5, %%mm1 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm1 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
+ "add %3, %%"FF_REG_a" \n\t"
-#undef DEF
-#undef SET_RND
-#undef STATIC
+ "subl $2, %0 \n\t"
+ "jnz 1b \n\t"
+ :"+g"(h), "+S"(pixels)
+ :"D"(block), "r"((x86_reg)line_size)
+ :FF_REG_a, "memory");
+}
// this routine is 'slightly' suboptimal but mostly unused
static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c
deleted file mode 100644
index 4590aeddf0..0000000000
--- a/libavcodec/x86/rnd_template.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
- * Copyright (c) 2000, 2001 Fabrice Bellard
- * Copyright (c) 2003-2004 Michael Niedermayer <[email protected]>
- *
- * MMX optimization by Nick Kurshev <[email protected]>
- * mostly rewritten by Michael Niedermayer <[email protected]>
- * and improved by Zdenek Kabelac <[email protected]>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "inline_asm.h"
-
-// put_pixels
-av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t
*pixels,
- ptrdiff_t line_size, int h)
-{
- MOVQ_ZERO(mm7);
- SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
- __asm__ volatile(
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm4 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
- "add %3, %1 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
- "movq %%mm0, %%mm1 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "punpcklbw %%mm7, %%mm0 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpckhbw %%mm7, %%mm1 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "paddusw %%mm2, %%mm0 \n\t"
- "paddusw %%mm3, %%mm1 \n\t"
- "paddusw %%mm6, %%mm4 \n\t"
- "paddusw %%mm6, %%mm5 \n\t"
- "paddusw %%mm0, %%mm4 \n\t"
- "paddusw %%mm1, %%mm5 \n\t"
- "psrlw $2, %%mm4 \n\t"
- "psrlw $2, %%mm5 \n\t"
- "packuswb %%mm5, %%mm4 \n\t"
- "movq %%mm4, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
- "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
- "movq %%mm2, %%mm3 \n\t"
- "movq %%mm4, %%mm5 \n\t"
- "punpcklbw %%mm7, %%mm2 \n\t"
- "punpcklbw %%mm7, %%mm4 \n\t"
- "punpckhbw %%mm7, %%mm3 \n\t"
- "punpckhbw %%mm7, %%mm5 \n\t"
- "paddusw %%mm2, %%mm4 \n\t"
- "paddusw %%mm3, %%mm5 \n\t"
- "paddusw %%mm6, %%mm0 \n\t"
- "paddusw %%mm6, %%mm1 \n\t"
- "paddusw %%mm4, %%mm0 \n\t"
- "paddusw %%mm5, %%mm1 \n\t"
- "psrlw $2, %%mm0 \n\t"
- "psrlw $2, %%mm1 \n\t"
- "packuswb %%mm1, %%mm0 \n\t"
- "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
- "add %3, %%"FF_REG_a" \n\t"
-
- "subl $2, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels)
- :"D"(block), "r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
commit 43fe9554cc7998fbe0bae455c7b374e76a4d253f
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:49:44 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:55 2025 +0200
avcodec/x86/hpeldsp_init: Avoid complicating macro
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 48a1aa7a2c..66ed886ea9 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -69,8 +69,6 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t
*pixels,
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
-
#if HAVE_INLINE_ASM
/***********************************/
@@ -167,25 +165,16 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx,
put_no_rnd_pixels8_xy2_mmx, 8)
#endif
#endif /* HAVE_INLINE_ASM */
-#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
- if (HAVE_MMX_EXTERNAL) \
- c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU
-
-#define SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU)
\
- do {
\
- SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);
\
- c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ##
CPU; \
- } while (0)
-
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_INLINE
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
+ c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
+#endif
#if HAVE_MMX_EXTERNAL
+ c->put_no_rnd_pixels_tab[1][0] =
c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
-#endif
- SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
#endif
}
commit 00e046df132fd1751c50798334b985beec89661f
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:32:55 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:53 2025 +0200
avcodec/x86/hpeldsp_init: Remove MMX(EXT) funcs overridden by SSE2
This affects the {avg,put}_no_rnd_pixels16_{x,y}2 MMX and
(put-only) MMXEXT versions. Removing these functions saved
1184B here.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 4f369c9731..48a1aa7a2c 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -161,167 +161,12 @@ static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block,
const uint8_t *pixels,
:FF_REG_a, "memory");
}
-static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%1, %3), %%mm2 \n\t"
- "movq 9(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, 8(%2) \n\t"
- "movq %%mm5, 8(%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm2\n\t"
- PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"),%%mm0\n\t"
- PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
-static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq 8(%1), %%mm0 \n\t"
- "movq 9(%1), %%mm1 \n\t"
- "movq 8(%2), %%mm3 \n\t"
- PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6)
- PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
- "movq %%mm0, 8(%2) \n\t"
- "add %3, %1 \n\t"
- "add %3, %2 \n\t"
- "subl $1, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :"memory");
-}
-
-static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- "movq (%1), %%mm0 \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
- PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm0, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
-
- "movq (%1, %3), %%mm1 \n\t"
- "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
- PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
- "movq (%2), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
- "movq (%2, %3), %%mm3 \n\t"
- PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
- "movq %%mm2, (%2) \n\t"
- "movq %%mm1, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
-
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
#if HAVE_MMX
-CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8)
-CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8)
-
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif
#endif /* HAVE_INLINE_ASM */
-
-#if HAVE_X86ASM
-
-#define HPELDSP_AVG_PIXELS16(CPUEXT) \
- CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2
## CPUEXT, 8) \
- CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2
## CPUEXT, 8)
-
-HPELDSP_AVG_PIXELS16(_mmxext)
-
-#endif /* HAVE_X86ASM */
-
#define SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU) \
if (HAVE_MMX_EXTERNAL) \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU
@@ -331,18 +176,11 @@ HPELDSP_AVG_PIXELS16(_mmxext)
SET_HPEL_FUNCS_EXT(PFX, IDX, SIZE, CPU);
\
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ##
CPU; \
} while (0)
-#define SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU)
\
- do {
\
- c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ##
CPU; \
- c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ##
CPU; \
- } while (0)
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_INLINE
- SET_HPEL_FUNCS12(put_no_rnd, [0], 16, mmx);
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
- SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx);
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
#if HAVE_MMX_EXTERNAL
c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
@@ -365,8 +203,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int
flags)
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
- c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
- c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
}
commit 30c4007c65e1f73de5ce1b5eb459c71e0b21389a
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 04:15:22 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:49 2025 +0200
avcodec/x86/hpeldsp: Add SSE2 avg_no_rnd size 16 versions
These currently only exist as MMX versions.
The added functions occupy 320B here. So far, they are only for
the x2 and y2 (i.e. right and down, not down-right) directions.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 522a349e21..e9f988f7b5 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -125,12 +125,12 @@ cglobal put_no_rnd_pixels8_x2, 4,5
RET
-%macro NO_RND_PIXELS_X2 0
+%macro NO_RND_PIXELS_X2 1
%if cpuflag(sse2)
-cglobal put_no_rnd_pixels16_x2, 4,5,5
+cglobal %1_no_rnd_pixels16_x2, 4,5,5
%else
; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-cglobal put_no_rnd_pixels8_x2_exact, 4,5
+cglobal %1_no_rnd_pixels8_x2_exact, 4,5
%endif
lea r4, [r2*3]
pcmpeqb m4, m4
@@ -147,6 +147,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
PAVGB m2, m3
pxor m0, m4
pxor m2, m4
+%ifidn %1, avg
+ pavgb m0, [r0]
+ pavgb m2, [r0+r2]
+%endif
mova [r0], m0
mova [r0+r2], m2
movu m0, [r1+r2*2]
@@ -161,6 +165,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
PAVGB m2, m3
pxor m0, m4
pxor m2, m4
+%ifidn %1, avg
+ pavgb m0, [r0+r2*2]
+ pavgb m2, [r0+r4]
+%endif
mova [r0+r2*2], m0
mova [r0+r4], m2
lea r1, [r1+r2*4]
@@ -171,9 +179,10 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
%endmacro
INIT_MMX mmxext
-NO_RND_PIXELS_X2
+NO_RND_PIXELS_X2 put
INIT_XMM sse2
-NO_RND_PIXELS_X2
+NO_RND_PIXELS_X2 avg
+NO_RND_PIXELS_X2 put
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro PUT_PIXELS8_Y2 0
@@ -245,12 +254,12 @@ cglobal put_no_rnd_pixels8_y2, 4,5
RET
-%macro NO_RND_PIXELS_Y2 0
+%macro NO_RND_PIXELS_Y2 1
%if cpuflag(sse2)
-cglobal put_no_rnd_pixels16_y2, 4,5,4
+cglobal %1_no_rnd_pixels16_y2, 4,5,4
%else
; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-cglobal put_no_rnd_pixels8_y2_exact, 4,5
+cglobal %1_no_rnd_pixels8_y2_exact, 4,5
%endif
lea r4, [r2*3]
movu m0, [r1]
@@ -266,6 +275,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
PAVGB m1, m2
pxor m0, m3
pxor m1, m3
+%ifidn %1, avg
+ pavgb m0, [r0]
+ pavgb m1, [r0+r2]
+%endif
mova [r0], m0
mova [r0+r2], m1
movu m1, [r1+r2*2]
@@ -276,6 +289,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
PAVGB m1, m0
pxor m2, m3
pxor m1, m3
+%ifidn %1, avg
+ pavgb m2,[r0+r2*2]
+ pavgb m1,[r0+r4]
+%endif
mova [r0+r2*2], m2
mova [r0+r4], m1
lea r1, [r1+r2*4]
@@ -286,9 +303,10 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
%endmacro
INIT_MMX mmxext
-NO_RND_PIXELS_Y2
+NO_RND_PIXELS_Y2 put
INIT_XMM sse2
-NO_RND_PIXELS_Y2
+NO_RND_PIXELS_Y2 avg
+NO_RND_PIXELS_Y2 put
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro AVG_PIXELS8_X2 0
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c8ccd7b011..4f369c9731 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -51,6 +51,8 @@ void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -60,6 +62,8 @@ void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_avg_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -385,7 +389,10 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_sse2;
+
c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_sse2;
+ c->avg_no_rnd_pixels_tab[1] = ff_avg_no_rnd_pixels16_x2_sse2;
+ c->avg_no_rnd_pixels_tab[2] = ff_avg_no_rnd_pixels16_y2_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
commit 1e677e696488d52068e83c669ae871caa7c34583
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 03:52:28 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:46 2025 +0200
avcodec/x86/hpeldsp: Add SSE2 put_no_rnd size 16 versions
These currently only exist as MMX and (not bitexact) MMXEXT versions.
The added functions occupy 288B here. So far, they are only for
the x2 and y2 (i.e. right and down, not down-right) directions.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 859894856d..522a349e21 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -125,38 +125,42 @@ cglobal put_no_rnd_pixels8_x2, 4,5
RET
+%macro NO_RND_PIXELS_X2 0
+%if cpuflag(sse2)
+cglobal put_no_rnd_pixels16_x2, 4,5,5
+%else
; void ff_put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-INIT_MMX mmxext
cglobal put_no_rnd_pixels8_x2_exact, 4,5
+%endif
lea r4, [r2*3]
- pcmpeqb m6, m6
+ pcmpeqb m4, m4
.loop:
- mova m0, [r1]
- mova m2, [r1+r2]
- mova m1, [r1+1]
- mova m3, [r1+r2+1]
- pxor m0, m6
- pxor m2, m6
- pxor m1, m6
- pxor m3, m6
+ movu m0, [r1]
+ movu m2, [r1+r2]
+ movu m1, [r1+1]
+ movu m3, [r1+r2+1]
+ pxor m0, m4
+ pxor m2, m4
+ pxor m1, m4
+ pxor m3, m4
PAVGB m0, m1
PAVGB m2, m3
- pxor m0, m6
- pxor m2, m6
+ pxor m0, m4
+ pxor m2, m4
mova [r0], m0
mova [r0+r2], m2
- mova m0, [r1+r2*2]
- mova m1, [r1+r2*2+1]
- mova m2, [r1+r4]
- mova m3, [r1+r4+1]
- pxor m0, m6
- pxor m1, m6
- pxor m2, m6
- pxor m3, m6
+ movu m0, [r1+r2*2]
+ movu m1, [r1+r2*2+1]
+ movu m2, [r1+r4]
+ movu m3, [r1+r4+1]
+ pxor m0, m4
+ pxor m1, m4
+ pxor m2, m4
+ pxor m3, m4
PAVGB m0, m1
PAVGB m2, m3
- pxor m0, m6
- pxor m2, m6
+ pxor m0, m4
+ pxor m2, m4
mova [r0+r2*2], m0
mova [r0+r4], m2
lea r1, [r1+r2*4]
@@ -164,7 +168,12 @@ cglobal put_no_rnd_pixels8_x2_exact, 4,5
sub r3d, 4
jg .loop
RET
+%endmacro
+INIT_MMX mmxext
+NO_RND_PIXELS_X2
+INIT_XMM sse2
+NO_RND_PIXELS_X2
; void ff_put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro PUT_PIXELS8_Y2 0
@@ -236,33 +245,37 @@ cglobal put_no_rnd_pixels8_y2, 4,5
RET
+%macro NO_RND_PIXELS_Y2 0
+%if cpuflag(sse2)
+cglobal put_no_rnd_pixels16_y2, 4,5,4
+%else
; void ff_put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-INIT_MMX mmxext
cglobal put_no_rnd_pixels8_y2_exact, 4,5
+%endif
lea r4, [r2*3]
- mova m0, [r1]
- pcmpeqb m6, m6
+ movu m0, [r1]
+ pcmpeqb m3, m3
add r1, r2
- pxor m0, m6
+ pxor m0, m3
.loop:
- mova m1, [r1]
- mova m2, [r1+r2]
- pxor m1, m6
- pxor m2, m6
+ movu m1, [r1]
+ movu m2, [r1+r2]
+ pxor m1, m3
+ pxor m2, m3
PAVGB m0, m1
PAVGB m1, m2
- pxor m0, m6
- pxor m1, m6
+ pxor m0, m3
+ pxor m1, m3
mova [r0], m0
mova [r0+r2], m1
- mova m1, [r1+r2*2]
- mova m0, [r1+r4]
- pxor m1, m6
- pxor m0, m6
+ movu m1, [r1+r2*2]
+ movu m0, [r1+r4]
+ pxor m1, m3
+ pxor m0, m3
PAVGB m2, m1
PAVGB m1, m0
- pxor m2, m6
- pxor m1, m6
+ pxor m2, m3
+ pxor m1, m3
mova [r0+r2*2], m2
mova [r0+r4], m1
lea r1, [r1+r2*4]
@@ -270,7 +283,12 @@ cglobal put_no_rnd_pixels8_y2_exact, 4,5
sub r3d, 4
jg .loop
RET
+%endmacro
+INIT_MMX mmxext
+NO_RND_PIXELS_Y2
+INIT_XMM sse2
+NO_RND_PIXELS_Y2
; void ff_avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro AVG_PIXELS8_X2 0
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index ab32b825c9..c8ccd7b011 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -49,6 +49,8 @@ void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const
uint8_t *pixels,
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -56,6 +58,8 @@ void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const
uint8_t *pixels,
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
+void ff_put_no_rnd_pixels16_y2_sse2(uint8_t *block, const uint8_t *pixels,
+ ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
@@ -369,10 +373,14 @@ static void hpeldsp_init_sse2(HpelDSPContext *c, int
flags)
{
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
- c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_sse2;
c->put_pixels_tab[0][2] = ff_put_pixels16_y2_sse2;
c->put_pixels_tab[0][3] = ff_put_pixels16_xy2_sse2;
+
+ c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
+ c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_sse2;
+ c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_sse2;
+
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_sse2;
c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_sse2;
commit 262791b8d8c7a5e3df44c8784de192857e67d52f
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 05:41:04 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:43 2025 +0200
avcodec/hpeldsp: Make put_no_rnd_pixels_tab smaller
Only block sizes 16 and 8 are implemented, yet the motion estimation
code touches the block size 4 entries. Nothing, however, touches
the block size 2 entries, so we can reduce the put_no_rnd_pixels_tab
array size to [3][4].
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 1f6a165bf6..6c9fdce0c1 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -77,10 +77,10 @@ typedef struct HpelDSPContext {
* @param pixels source
* @param line_size number of bytes in a horizontal line of block
* @param h height
- * @note The size is kept at [4][4] to match the above pixel_tabs and avoid
- * out of bounds reads in the motion estimation code.
+ * @note The size is kept at [3][4] to avoid out of bounds accesses
+ * in the motion estimation code.
*/
- op_pixels_func put_no_rnd_pixels_tab[4][4];
+ op_pixels_func put_no_rnd_pixels_tab[3][4];
/**
* Halfpel motion compensation with no rounding (a+b)>>1.
commit c7161befb4ae7d0f40e35676f52507e7de1c8b01
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 15:12:49 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:39 2025 +0200
avcodec/x86/h264_qpel: Remove MMX(EXT) funcs overridden by SSSE3
SSSE3 is already quite old (introduced in 2006 for Intel, 2011 for AMD),
so the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions that the SSSE3
versions override (and which don't abide by the ABI) to get closer
to a removal of emms_c.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index 69ffd001e0..18d80a52f6 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -46,12 +46,10 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t
*src1, const uint8_t
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-#define ff_put_pixels8_mmxext(...)
#define ff_put_pixels4_mmxext(...)
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, const uint8_t
*src, int dstStride, int srcStride);\
-void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, const uint8_t
*src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, const uint8_t
*src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, const
uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, const
uint8_t *src, const uint8_t *src2, int dstStride, int src2Stride);\
@@ -91,15 +89,6 @@ static av_always_inline void ff_ ## OPNAME ##
h264_qpel8or16_hv2_lowpass_ ## MMX
}while(w--);\
}\
\
-static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ##
MMX(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride){\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride,
srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride,
srcStride);\
- src += 8*srcStride;\
- dst += 8*dstStride;\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride,
srcStride);\
- ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride,
srcStride);\
-}\
-\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ##
MMX(uint8_t *dst, const uint8_t *src, const uint8_t *src2, int dstStride, int
src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 ,
dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8,
dstStride, src2Stride);\
@@ -196,10 +185,6 @@ static av_always_inline void ff_ ## OPNAME ##
h264_qpel16_hv_lowpass_ ## MMX(uin
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2
ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2
ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
-#define H264_MC_C_H(OPNAME, SIZE, MMX, ALIGN) \
-H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
-H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
-
#define H264_MC_C_V_H_HV(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
@@ -356,8 +341,7 @@ QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
-H264_MC(H264_MC_C_H, 8, mmxext, 8)
-H264_MC(H264_MC_H, 16, mmxext, 8)
+H264_MC_C(avg_, 8, mmxext, 8)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
@@ -421,20 +405,11 @@ LUMA_MC_816(10, mc33, sse2)
#endif /* HAVE_X86ASM */
-#define SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX) \
+#define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
- } while (0)
-#define SET_QPEL_FUNCS0123(PFX, IDX, SIZE, CPU, PREFIX) \
- do { \
- c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
- SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX); \
- } while (0)
-#define SET_QPEL_FUNCS_1PP(PFX, IDX, SIZE, CPU, PREFIX) \
- do { \
- SET_QPEL_FUNCS123(PFX, IDX, SIZE, CPU, PREFIX); \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
@@ -478,11 +453,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int
bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
- SET_QPEL_FUNCS123 (put_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS123 (put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, );
- SET_QPEL_FUNCS123 (avg_h264_qpel, 0, 16, mmxext, );
- SET_QPEL_FUNCS0123(avg_h264_qpel, 1, 8, mmxext, );
+ c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_mmxext;
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
diff --git a/libavcodec/x86/h264_qpel_8bit.asm
b/libavcodec/x86/h264_qpel_8bit.asm
index 4e64329991..89e7c282b2 100644
--- a/libavcodec/x86/h264_qpel_8bit.asm
+++ b/libavcodec/x86/h264_qpel_8bit.asm
@@ -96,66 +96,6 @@ INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
-%macro QPEL8_H_LOWPASS_OP 1
-cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
- movsxdifnidn r2, r2d
- movsxdifnidn r3, r3d
- mov r4d, 8
- pxor m7, m7
- mova m6, [pw_5]
-.loop:
- mova m0, [r1]
- mova m2, [r1+1]
- mova m1, m0
- mova m3, m2
- punpcklbw m0, m7
- punpckhbw m1, m7
- punpcklbw m2, m7
- punpckhbw m3, m7
- paddw m0, m2
- paddw m1, m3
- psllw m0, 2
- psllw m1, 2
- mova m2, [r1-1]
- mova m4, [r1+2]
- mova m3, m2
- mova m5, m4
- punpcklbw m2, m7
- punpckhbw m3, m7
- punpcklbw m4, m7
- punpckhbw m5, m7
- paddw m2, m4
- paddw m5, m3
- psubw m0, m2
- psubw m1, m5
- pmullw m0, m6
- pmullw m1, m6
- movd m2, [r1-2]
- movd m5, [r1+7]
- punpcklbw m2, m7
- punpcklbw m5, m7
- paddw m2, m3
- paddw m4, m5
- mova m5, [pw_16]
- paddw m2, m5
- paddw m4, m5
- paddw m0, m2
- paddw m1, m4
- psraw m0, 5
- psraw m1, 5
- packuswb m0, m1
- op_%1 m0, [r0], m4
- add r0, r2
- add r1, r3
- dec r4d
- jg .loop
- RET
-%endmacro
-
-INIT_MMX mmxext
-QPEL8_H_LOWPASS_OP put
-QPEL8_H_LOWPASS_OP avg
-
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
commit 4fc05c28f426d6073e6e15db334b0c88ff925f1d
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 13:12:31 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:35 2025 +0200
avfilter/x86/vf_gradfun: Remove MMXEXT func overridden by SSSE3
SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
so that the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 version
of filter_line.
This commit therefore removes the overridden MMXEXT version
(which didn't abide by the ABI), which allows us to remove
an emms_c() from vf_gradfun.c, so that users with SSSE3 no longer
pay a price for the mere existence of an MMXEXT version.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavfilter/vf_gradfun.c b/libavfilter/vf_gradfun.c
index 088b3c9143..4f211c3ddf 100644
--- a/libavfilter/vf_gradfun.c
+++ b/libavfilter/vf_gradfun.c
@@ -32,7 +32,6 @@
* Dither it back to 8bit.
*/
-#include "libavutil/emms.h"
#include "libavutil/imgutils.h"
#include "libavutil/common.h"
#include "libavutil/mem.h"
@@ -119,7 +118,6 @@ static void filter(GradFunContext *ctx, uint8_t *dst, const
uint8_t *src, int wi
ctx->filter_line(dst + y * dst_linesize, src + y * src_linesize, dc -
r / 2, width, thresh, dither[y & 7]);
if (++y >= height) break;
}
- emms_c();
}
static av_cold int init(AVFilterContext *ctx)
diff --git a/libavfilter/x86/vf_gradfun.asm b/libavfilter/x86/vf_gradfun.asm
index d106d52100..55e7c1ea0f 100644
--- a/libavfilter/x86/vf_gradfun.asm
+++ b/libavfilter/x86/vf_gradfun.asm
@@ -27,7 +27,15 @@ pw_ff: times 8 dw 0xFF
SECTION .text
-%macro FILTER_LINE 1
+INIT_XMM ssse3
+cglobal gradfun_filter_line, 6, 6, 8
+ movd m5, r4d
+ pxor m7, m7
+ pshuflw m5, m5, 0
+ mova m6, [pw_7f]
+ punpcklqdq m5, m5
+ mova m4, [r5]
+.loop:
movh m0, [r2+r0]
movh m1, [r3+r0]
punpcklbw m0, m7
@@ -40,42 +48,12 @@ SECTION .text
pminsw m2, m7
pmullw m2, m2
psllw m1, 2
- paddw m0, %1
+ paddw m0, m4
pmulhw m1, m2
paddw m0, m1
psraw m0, 7
packuswb m0, m0
movh [r1+r0], m0
-%endmacro
-
-INIT_MMX mmxext
-cglobal gradfun_filter_line, 6, 6
- movh m5, r4d
- pxor m7, m7
- pshufw m5, m5,0
- mova m6, [pw_7f]
- mova m3, [r5]
- mova m4, [r5+8]
-.loop:
- FILTER_LINE m3
- add r0, 4
- jge .end
- FILTER_LINE m4
- add r0, 4
- jl .loop
-.end:
- RET
-
-INIT_XMM ssse3
-cglobal gradfun_filter_line, 6, 6, 8
- movd m5, r4d
- pxor m7, m7
- pshuflw m5, m5, 0
- mova m6, [pw_7f]
- punpcklqdq m5, m5
- mova m4, [r5]
-.loop:
- FILTER_LINE m4
add r0, 8
jl .loop
RET
diff --git a/libavfilter/x86/vf_gradfun_init.c
b/libavfilter/x86/vf_gradfun_init.c
index 56e6774a79..f262f0a1bb 100644
--- a/libavfilter/x86/vf_gradfun_init.c
+++ b/libavfilter/x86/vf_gradfun_init.c
@@ -24,9 +24,6 @@
#include "libavutil/x86/cpu.h"
#include "libavfilter/gradfun.h"
-void ff_gradfun_filter_line_mmxext(intptr_t x, uint8_t *dst, const uint8_t
*src,
- const uint16_t *dc, int thresh,
- const uint16_t *dithers);
void ff_gradfun_filter_line_ssse3(intptr_t x, uint8_t *dst, const uint8_t *src,
const uint16_t *dc, int thresh,
const uint16_t *dithers);
@@ -39,23 +36,6 @@ void ff_gradfun_blur_line_movdqu_sse2(intptr_t x, uint16_t
*buf,
const uint8_t *src1, const uint8_t
*src2);
#if HAVE_X86ASM
-static void gradfun_filter_line_mmxext(uint8_t *dst, const uint8_t *src,
- const uint16_t *dc,
- int width, int thresh,
- const uint16_t *dithers)
-{
- intptr_t x;
- if (width & 3) {
- x = width & ~3;
- ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2,
- width - x, thresh, dithers);
- width = x;
- }
- x = -width;
- ff_gradfun_filter_line_mmxext(x, dst + width, src + width, dc + width / 2,
- thresh, dithers);
-}
-
static void gradfun_filter_line_ssse3(uint8_t *dst, const uint8_t *src, const
uint16_t *dc,
int width, int thresh,
const uint16_t *dithers)
@@ -93,8 +73,6 @@ av_cold void ff_gradfun_init_x86(GradFunContext *gf)
#if HAVE_X86ASM
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_MMXEXT(cpu_flags))
- gf->filter_line = gradfun_filter_line_mmxext;
if (EXTERNAL_SSSE3(cpu_flags))
gf->filter_line = gradfun_filter_line_ssse3;
commit 5ef613bcb0508f16bd5b190168183326391de9b0
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 06:22:05 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:31 2025 +0200
avcodec/x86/mpegvideoencdsp_init: Remove MMX, 3DNOW funcs overridden by
SSSE3
SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
so that the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX and 3DNOW functions overridden
by them (which don't abide by the ABI) to get closer to a removal
of emms_c.
Also merge the mpegvideoenc_qns_template.c file into the main file.
The 3DNOW functions removed in this commit were the last in the
codebase.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/mpegvideoenc_qns_template.c
b/libavcodec/x86/mpegvideoenc_qns_template.c
deleted file mode 100644
index 0d6454f45f..0000000000
--- a/libavcodec/x86/mpegvideoenc_qns_template.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * QNS functions are compiled 3 times for MMX/3DNOW/SSSE3
- * Copyright (c) 2004 Michael Niedermayer
- *
- * MMX optimization by Michael Niedermayer <[email protected]>
- * 3DNow! and SSSE3 optimization by Zuxy Meng <[email protected]>
- *
- * This file is part of FFmpeg.
- *
- * FFmpeg is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * FFmpeg is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with FFmpeg; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#include <stdint.h>
-
-#include "libavutil/avassert.h"
-#include "libavutil/common.h"
-#include "libavutil/x86/asm.h"
-
-#include "inline_asm.h"
-
-#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
-
-static int DEF(try_8x8basis)(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale)
-{
- x86_reg i=0;
-
- av_assert2(FFABS(scale) < MAX_ABS);
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
-
- SET_RND(mm6);
- __asm__ volatile(
- "pxor %%mm7, %%mm7 \n\t"
- "movd %4, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "psraw $6, %%mm0 \n\t"
- "psraw $6, %%mm1 \n\t"
- "pmullw (%3, %0), %%mm0 \n\t"
- "pmullw 8(%3, %0), %%mm1 \n\t"
- "pmaddwd %%mm0, %%mm0 \n\t"
- "pmaddwd %%mm1, %%mm1 \n\t"
- "paddd %%mm1, %%mm0 \n\t"
- "psrld $4, %%mm0 \n\t"
- "paddd %%mm0, %%mm7 \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" //FIXME optimize & bench
- " jb 1b \n\t"
- PHADDD(%%mm7, %%mm6)
- "psrld $2, %%mm7 \n\t"
- "movd %%mm7, %0 \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
- );
- return i;
-}
-
-static void DEF(add_8x8basis)(int16_t rem[64], const int16_t basis[64], int
scale)
-{
- x86_reg i=0;
-
- if(FFABS(scale) < MAX_ABS){
- scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
- SET_RND(mm6);
- __asm__ volatile(
- "movd %3, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- "punpcklwd %%mm5, %%mm5 \n\t"
- ".p2align 4 \n\t"
- "1: \n\t"
- "movq (%1, %0), %%mm0 \n\t"
- "movq 8(%1, %0), %%mm1 \n\t"
- PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
- "paddw (%2, %0), %%mm0 \n\t"
- "paddw 8(%2, %0), %%mm1 \n\t"
- "movq %%mm0, (%2, %0) \n\t"
- "movq %%mm1, 8(%2, %0) \n\t"
- "add $16, %0 \n\t"
- "cmp $128, %0 \n\t" // FIXME optimize & bench
- " jb 1b \n\t"
-
- : "+r" (i)
- : "r"(basis), "r"(rem), "g"(scale)
- );
- }else{
- for(i=0; i<8*8; i++){
- rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT -
RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
- }
- }
-}
diff --git a/libavcodec/x86/mpegvideoencdsp_init.c
b/libavcodec/x86/mpegvideoencdsp_init.c
index d39091a5c9..78c2ef87b8 100644
--- a/libavcodec/x86/mpegvideoencdsp_init.c
+++ b/libavcodec/x86/mpegvideoencdsp_init.c
@@ -16,9 +16,13 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include <stdint.h>
+
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
+#include "libavutil/common.h"
#include "libavutil/cpu.h"
+#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideoencdsp.h"
@@ -28,71 +32,93 @@ int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t
line_size);
int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);
#if HAVE_INLINE_ASM
-
-#define PHADDD(a, t) \
- "movq " #a ", " #t " \n\t" \
- "psrlq $32, " #a " \n\t" \
- "paddd " #t ", " #a " \n\t"
+#if HAVE_SSSE3_INLINE
+#define SCALE_OFFSET -1
/*
- * pmulhw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15])[16 - 31]
- * pmulhrw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x8000)[16 - 31]
* pmulhrsw: dst[0 - 15] = (src[0 - 15] * dst[0 - 15] + 0x4000)[15 - 30]
*/
-#define PMULHRW(x, y, s, o) \
- "pmulhw " #s ", " #x " \n\t" \
- "pmulhw " #s ", " #y " \n\t" \
- "paddw " #o ", " #x " \n\t" \
- "paddw " #o ", " #y " \n\t" \
- "psraw $1, " #x " \n\t" \
- "psraw $1, " #y " \n\t"
-#define DEF(x) x ## _mmx
-#define SET_RND MOVQ_WONE
-#define SCALE_OFFSET 1
-
-#include "mpegvideoenc_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#define DEF(x) x ## _3dnow
-#define SET_RND(x)
-#define SCALE_OFFSET 0
-#define PMULHRW(x, y, s, o) \
- "pmulhrw " #s ", " #x " \n\t" \
- "pmulhrw " #s ", " #y " \n\t"
-
-#include "mpegvideoenc_qns_template.c"
-
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-
-#if HAVE_SSSE3_INLINE
-#undef PHADDD
-#define DEF(x) x ## _ssse3
-#define SET_RND(x)
-#define SCALE_OFFSET -1
-
-#define PHADDD(a, t) \
- "pshufw $0x0E, " #a ", " #t " \n\t" \
- /* faster than phaddd on core2 */ \
- "paddd " #t ", " #a " \n\t"
-
#define PMULHRW(x, y, s, o) \
"pmulhrsw " #s ", " #x " \n\t" \
"pmulhrsw " #s ", " #y " \n\t"
-#include "mpegvideoenc_qns_template.c"
+#define MAX_ABS 512
+
+static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64],
const int16_t basis[64], int scale)
+{
+ x86_reg i=0;
+
+ av_assert2(FFABS(scale) < MAX_ABS);
+ scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+
+ __asm__ volatile(
+ "pxor %%mm7, %%mm7 \n\t"
+ "movd %4, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "psraw $6, %%mm0 \n\t"
+ "psraw $6, %%mm1 \n\t"
+ "pmullw (%3, %0), %%mm0 \n\t"
+ "pmullw 8(%3, %0), %%mm1 \n\t"
+ "pmaddwd %%mm0, %%mm0 \n\t"
+ "pmaddwd %%mm1, %%mm1 \n\t"
+ "paddd %%mm1, %%mm0 \n\t"
+ "psrld $4, %%mm0 \n\t"
+ "paddd %%mm0, %%mm7 \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" //FIXME optimize & bench
+ " jb 1b \n\t"
+ "pshufw $0x0E, %%mm7, %%mm6 \n\t"
+ "paddd %%mm6, %%mm7 \n\t" // faster than phaddd on core2
+ "psrld $2, %%mm7 \n\t"
+ "movd %%mm7, %0 \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
+ );
+ return i;
+}
+
+static void add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int
scale)
+{
+ x86_reg i=0;
+
+ if (FFABS(scale) < MAX_ABS) {
+ scale <<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
+ __asm__ volatile(
+ "movd %3, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ "punpcklwd %%mm5, %%mm5 \n\t"
+ ".p2align 4 \n\t"
+ "1: \n\t"
+ "movq (%1, %0), %%mm0 \n\t"
+ "movq 8(%1, %0), %%mm1 \n\t"
+ PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
+ "paddw (%2, %0), %%mm0 \n\t"
+ "paddw 8(%2, %0), %%mm1 \n\t"
+ "movq %%mm0, (%2, %0) \n\t"
+ "movq %%mm1, 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ "cmp $128, %0 \n\t" // FIXME optimize & bench
+ " jb 1b \n\t"
+
+ : "+r" (i)
+ : "r"(basis), "r"(rem), "g"(scale)
+ );
+ } else {
+ for (i=0; i<8*8; i++) {
+ rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT -
RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
+ }
+ }
+}
-#undef DEF
-#undef SET_RND
-#undef SCALE_OFFSET
-#undef PMULHRW
-#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */
/* Draw the edges of width 'w' of an image of size width, height */
@@ -197,23 +223,11 @@ av_cold void
ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
- if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->try_8x8basis = try_8x8basis_mmx;
- }
- c->add_8x8basis = add_8x8basis_mmx;
-
if (avctx->bits_per_raw_sample <= 8) {
c->draw_edges = draw_edges_mmx;
}
}
- if (INLINE_AMD3DNOW(cpu_flags)) {
- if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
- c->try_8x8basis = try_8x8basis_3dnow;
- }
- c->add_8x8basis = add_8x8basis_3dnow;
- }
-
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
commit 6a47ea5f9fdaedd6aa4bc8723c86a0c7a30d8ed1
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 16:17:53 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:26 2025 +0200
avcodec/x86/vvc/sao_10bit: Remove unused functions
Saves 65280B here.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/vvc/sao_10bit.asm b/libavcodec/x86/vvc/sao_10bit.asm
index b7d3d08008..ccf14a34a4 100644
--- a/libavcodec/x86/vvc/sao_10bit.asm
+++ b/libavcodec/x86/vvc/sao_10bit.asm
@@ -28,28 +28,6 @@
H2656_SAO_BAND_FILTER vvc, %1, %2, %3
%endmacro
-%macro VVC_SAO_BAND_FILTER_FUNCS 1
- VVC_SAO_BAND_FILTER %1, 8, 1
- VVC_SAO_BAND_FILTER %1, 16, 2
- VVC_SAO_BAND_FILTER %1, 32, 4
- VVC_SAO_BAND_FILTER %1, 48, 6
- VVC_SAO_BAND_FILTER %1, 64, 8
- VVC_SAO_BAND_FILTER %1, 80, 10
- VVC_SAO_BAND_FILTER %1, 96, 12
- VVC_SAO_BAND_FILTER %1, 112, 14
- VVC_SAO_BAND_FILTER %1, 128, 16
-%endmacro
-
-%macro VVC_SAO_BAND_FILTER_FUNCS 0
- VVC_SAO_BAND_FILTER_FUNCS 10
- VVC_SAO_BAND_FILTER_FUNCS 12
-%endmacro
-
-INIT_XMM sse2
-VVC_SAO_BAND_FILTER_FUNCS
-INIT_XMM avx
-VVC_SAO_BAND_FILTER_FUNCS
-
%if HAVE_AVX2_EXTERNAL
%macro VVC_SAO_BAND_FILTER_FUNCS_AVX2 1
@@ -75,22 +53,6 @@ VVC_SAO_BAND_FILTER_FUNCS_AVX2 12
H2656_SAO_EDGE_FILTER vvc, %1, %2, %3
%endmacro
-%macro VVC_SAO_EDGE_FILTER_FUNCS 1
- VVC_SAO_EDGE_FILTER %1, 8, 1
- VVC_SAO_EDGE_FILTER %1, 16, 2
- VVC_SAO_EDGE_FILTER %1, 32, 4
- VVC_SAO_EDGE_FILTER %1, 48, 6
- VVC_SAO_EDGE_FILTER %1, 64, 8
- VVC_SAO_EDGE_FILTER %1, 80, 10
- VVC_SAO_EDGE_FILTER %1, 96, 12
- VVC_SAO_EDGE_FILTER %1, 112, 14
- VVC_SAO_EDGE_FILTER %1, 128, 16
-%endmacro
-
-INIT_XMM sse2
-VVC_SAO_EDGE_FILTER_FUNCS 10
-VVC_SAO_EDGE_FILTER_FUNCS 12
-
%if HAVE_AVX2_EXTERNAL
%macro VVC_SAO_EDGE_FILTER_FUNCS_AVX2 1
commit 918d37d9d156f15b63952a22bfba0541dd087129
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 05:55:07 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:23 2025 +0200
avcodec/x86/rv40dsp_init: Remove MMX(EXT) funcs overridden by SSSE3
SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
so that the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions overridden
by them (which don't abide by the ABI) to get closer to a removal
of emms_c.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 9d8b58f929..859894856d 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -372,11 +372,7 @@ AVG_PIXELS8_Y2
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro SET_PIXELS_XY2 1
-%if cpuflag(sse2)
cglobal %1_pixels16_xy2, 4,5,8
-%else
-cglobal %1_pixels8_xy2, 4,5
-%endif
pxor m7, m7
mova m6, [pw_2]
movu m0, [r1]
@@ -448,8 +444,6 @@ cglobal %1_pixels8_xy2, 4,5
RET
%endmacro
-INIT_MMX mmxext
-SET_PIXELS_XY2 avg
INIT_XMM sse2
SET_PIXELS_XY2 put
SET_PIXELS_XY2 avg
diff --git a/libavcodec/x86/hpeldsp.h b/libavcodec/x86/hpeldsp.h
index ac7e625fda..8208e43ac1 100644
--- a/libavcodec/x86/hpeldsp.h
+++ b/libavcodec/x86/hpeldsp.h
@@ -25,22 +25,14 @@
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels8_xy2_ssse3(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_sse2(uint8_t *block, const uint8_t *pixels,
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 7ee2db1358..ab32b825c9 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -301,20 +301,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx,
put_no_rnd_pixels8_y2_mmx, 8)
CALL_2X_PIXELS(avg_no_rnd_pixels16_xy2_mmx, avg_no_rnd_pixels8_xy2_mmx, 8)
CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8)
#endif
-
-/***********************************/
-/* MMX rounding */
-
-#define SET_RND MOVQ_WTWO
-#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
-#define STATIC
-
-#include "rnd_template.c"
-
-#undef NO_AVG
-#undef DEF
-#undef SET_RND
-
#endif /* HAVE_INLINE_ASM */
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index ab9e644c60..780358abc2 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -174,34 +174,22 @@ DEFINE_FN(put, 8, ssse3)
DEFINE_FN(put, 16, sse2)
DEFINE_FN(put, 16, ssse3)
-DEFINE_FN(avg, 8, mmxext)
DEFINE_FN(avg, 8, ssse3)
DEFINE_FN(avg, 16, sse2)
DEFINE_FN(avg, 16, ssse3)
#endif /* HAVE_X86ASM */
-#if HAVE_MMX_INLINE
-DEFINE_FN(put, 8, mmx)
-#endif
-
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
av_unused int cpu_flags = av_get_cpu_flags();
-#if HAVE_MMX_INLINE
- if (INLINE_MMX(cpu_flags)) {
- c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
- }
-#endif /* HAVE_MMX_INLINE */
-
#if HAVE_X86ASM
if (EXTERNAL_MMX(cpu_flags)) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
- c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmxext;
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
}
commit e86f137514fb8a69cf145f26c83b1b053c727b52
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 05:28:17 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:19 2025 +0200
avcodec/x86/hpeldsp_init: Remove MMX(EXT) funcs overridden by SSSE3
SSSE3 is already quite old (introduced 2006 for Intel, 2011 for AMD),
so that the overwhelming majority of our users (particularly those
that actually update their FFmpeg) will be using the SSSE3 versions.
This commit therefore removes the MMX(EXT) functions overridden
by them (which don't abide by the ABI) to get closer to a removal
of emms_c.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index b59195de95..9d8b58f929 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -370,46 +370,6 @@ INIT_XMM sse2
AVG_PIXELS8_Y2
-; void ff_avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
-; Note this is not correctly rounded, and is therefore used for
-; not-bitexact output
-INIT_MMX mmxext
-cglobal avg_approx_pixels8_xy2, 4,5
- mova m6, [pb_1]
- lea r4, [r2*2]
- mova m0, [r1]
- PAVGB m0, [r1+1]
-.loop:
- mova m2, [r1+r4]
- mova m1, [r1+r2]
- psubusb m2, m6
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+r4+1]
- add r1, r4
- PAVGB m0, m1
- PAVGB m1, m2
- PAVGB m0, [r0]
- PAVGB m1, [r0+r2]
- mova [r0], m0
- mova [r0+r2], m1
- mova m1, [r1+r2]
- mova m0, [r1+r4]
- PAVGB m1, [r1+r2+1]
- PAVGB m0, [r1+r4+1]
- add r0, r4
- add r1, r4
- PAVGB m2, m1
- PAVGB m1, m0
- PAVGB m2, [r0]
- PAVGB m1, [r0+r2]
- mova [r0], m2
- mova [r0+r2], m1
- add r0, r4
- sub r3d, 4
- jne .loop
- RET
-
-
; void ff_avg_pixels16_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
%macro SET_PIXELS_XY2 1
%if cpuflag(sse2)
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c0913552d5..7ee2db1358 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -60,11 +60,7 @@ void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t
*pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
-#define put_pixels8_mmx ff_put_pixels8_mmx
-#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#if HAVE_INLINE_ASM
@@ -354,7 +350,9 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx);
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
- SET_HPEL_FUNCS03(put, [1], 8, mmx);
+#if HAVE_MMX_EXTERNAL
+ c->put_pixels_tab[1][0] = ff_put_pixels8_mmx;
+#endif
SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
#endif
}
@@ -368,7 +366,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int
flags)
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
- c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
@@ -378,8 +375,6 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int
flags)
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
-
- c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
commit 2cf9e733c6a666600423a0967f23341d9f09e3c8
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 02:08:03 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:16 2025 +0200
avcodec/x86/qpeldsp_init: Use SSE2 versions where possible
The mc00 versions (i.e. the qdsp functions with no subpixel
interpolation) are just wrappers around their fpel versions.
There are SSE2 versions of these, yet the qpel code only
uses the MMX(EXT) versions. This commit changes this and
also removes the MMX(EXT) versions.
This also allowed removing ff_avg_pixels16_mmxext and
ff_put_pixels16_mmx.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/fpel.asm b/libavcodec/x86/fpel.asm
index b07b789074..8551ff1ff3 100644
--- a/libavcodec/x86/fpel.asm
+++ b/libavcodec/x86/fpel.asm
@@ -67,12 +67,10 @@ cglobal %1_pixels%2, 4,5,4
INIT_MMX mmx
OP_PIXELS put, 8
-OP_PIXELS put, 16
INIT_MMX mmxext
OP_PIXELS avg, 4
OP_PIXELS avg, 8
-OP_PIXELS avg, 16
INIT_XMM sse2
OP_PIXELS put, 16
diff --git a/libavcodec/x86/fpel.h b/libavcodec/x86/fpel.h
index 47ffc8eec7..851a70b99f 100644
--- a/libavcodec/x86/fpel.h
+++ b/libavcodec/x86/fpel.h
@@ -26,14 +26,10 @@ void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t
*pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
diff --git a/libavcodec/x86/qpeldsp_init.c b/libavcodec/x86/qpeldsp_init.c
index 3b05e156cc..097cda0106 100644
--- a/libavcodec/x86/qpeldsp_init.c
+++ b/libavcodec/x86/qpeldsp_init.c
@@ -79,22 +79,10 @@ void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst,
const uint8_t *src,
int dstStride, int srcStride);
-#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
-#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
#if HAVE_X86ASM
-#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
-#define ff_put_pixels8_mmxext ff_put_pixels8_mmx
-
#define QPEL_OP(OPNAME, RND, MMX) \
-static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t stride) \
-{ \
- ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
-} \
- \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
@@ -291,13 +279,6 @@ static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst,
\
stride, 8); \
} \
\
-static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, \
- const uint8_t *src, \
- ptrdiff_t stride) \
-{ \
- ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
-} \
- \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride) \
@@ -504,11 +485,23 @@ QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
+#define MC00(OPNAME, SIZE, EXT) \
+static void OPNAME ## _qpel ## SIZE ## _mc00_ ## EXT(uint8_t *dst, \
+ const uint8_t *src,\
+ ptrdiff_t stride) \
+{ \
+ ff_ ## OPNAME ## _pixels ## SIZE ##_ ## EXT(dst, src, stride, SIZE);\
+}
+
+MC00(put, 8, mmx)
+MC00(avg, 8, mmxext)
+MC00(put, 16, sse2)
+MC00(avg, 16, sse2)
+
#endif /* HAVE_X86ASM */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
- c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
@@ -533,12 +526,20 @@ av_cold void ff_qpeldsp_init_x86(QpelDSPContext *c)
if (X86_MMXEXT(cpu_flags)) {
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
+ c->avg_qpel_pixels_tab[1][0] = avg_qpel8_mc00_mmxext;
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
+ c->put_no_rnd_qpel_pixels_tab[1][0] =
+ c->put_qpel_pixels_tab[1][0] = put_qpel8_mc00_mmx;
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
#endif /* HAVE_MMXEXT_EXTERNAL */
}
+ if (EXTERNAL_SSE2(cpu_flags)) {
+ c->put_no_rnd_qpel_pixels_tab[0][0] =
+ c->put_qpel_pixels_tab[0][0] = put_qpel16_mc00_sse2;
+ c->avg_qpel_pixels_tab[0][0] = avg_qpel16_mc00_sse2;
+ }
}
commit 1f9ef6a8dc6e57b360cf53dd644fde1936ad3047
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 01:18:54 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:12 2025 +0200
avcodec/x86/h264_qpel: Remove MMX(EXT) functions overridden by SSE2FAST
CPUs which support SSE2, but not in a fast way (so that
they get the additional AV_CPU_FLAG_SSE2SLOW) are ancient
nowadays (2007 and older), so ignore the distinction between
the two and remove MMX and MMXEXT functions that are now
overridden by SSE2 functions.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/h264_qpel.c b/libavcodec/x86/h264_qpel.c
index d69ccda89c..69ffd001e0 100644
--- a/libavcodec/x86/h264_qpel.c
+++ b/libavcodec/x86/h264_qpel.c
@@ -46,7 +46,6 @@ void ff_avg_pixels16_l2_mmxext(uint8_t *dst, const uint8_t
*src1, const uint8_t
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
-#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext(...)
#define ff_put_pixels4_mmxext(...)
@@ -217,7 +216,6 @@ static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, const
uint8_t *src,
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
-#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void av_unused OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t
*dst, const uint8_t *src, ptrdiff_t stride)\
@@ -359,7 +357,7 @@ QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
H264_MC(H264_MC_C_V_H_HV, 4, mmxext, 8)
H264_MC(H264_MC_C_H, 8, mmxext, 8)
-H264_MC(H264_MC_C_H, 16, mmxext, 8)
+H264_MC(H264_MC_H, 16, mmxext, 8)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
@@ -480,10 +478,10 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int
bit_depth)
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
- SET_QPEL_FUNCS0123(put_h264_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS123 (put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS123 (put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS_1PP(put_h264_qpel, 2, 4, mmxext, );
- SET_QPEL_FUNCS0123(avg_h264_qpel, 0, 16, mmxext, );
+ SET_QPEL_FUNCS123 (avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS0123(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
@@ -506,6 +504,8 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int
bit_depth)
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
+ c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
+ c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
}
if (bit_depth == 10) {
@@ -519,14 +519,6 @@ av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int
bit_depth)
}
}
- if (EXTERNAL_SSE2_FAST(cpu_flags)) {
- if (!high_bit_depth) {
- c->put_h264_qpel_pixels_tab[0][0] = put_h264_qpel16_mc00_sse2;
- c->avg_h264_qpel_pixels_tab[0][0] = avg_h264_qpel16_mc00_sse2;
- c->avg_h264_qpel_pixels_tab[1][0] = avg_h264_qpel8_mc00_sse2;
- }
- }
-
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
commit 8a7858dacf50797c7b81aad119e8811a849d0552
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 00:26:32 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:08 2025 +0200
avcodec/x86/hpeldsp_init: Remove MMX(EXT) functions overridden by SSE2FAST
CPUs which support SSE2, but not in a fast way (so that
they get the additional AV_CPU_FLAG_SSE2SLOW) are ancient
nowadays (2007 and older), so ignore the distinction between
the two and remove MMX and MMXEXT functions that are now
overridden by SSE2 functions.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index 3bc278618c..b59195de95 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -84,47 +84,7 @@ cglobal put_pixels8_x2, 4,5
INIT_MMX mmxext
PUT_PIXELS8_X2
-
; void ff_put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t
line_size, int h)
-%macro PUT_PIXELS_16 0
-cglobal put_pixels16_x2, 4,5
- lea r4, [r2*2]
-.loop:
- mova m0, [r1]
- mova m1, [r1+r2]
- mova m2, [r1+8]
- mova m3, [r1+r2+8]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+9]
- PAVGB m3, [r1+r2+9]
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+8], m2
- mova [r0+r2+8], m3
- add r1, r4
- add r0, r4
- mova m0, [r1]
- mova m1, [r1+r2]
- mova m2, [r1+8]
- mova m3, [r1+r2+8]
- PAVGB m0, [r1+1]
- PAVGB m1, [r1+r2+1]
- PAVGB m2, [r1+9]
- PAVGB m3, [r1+r2+9]
- add r1, r4
- mova [r0], m0
- mova [r0+r2], m1
- mova [r0+8], m2
- mova [r0+r2+8], m3
- add r0, r4
- sub r3d, 4
- jne .loop
- RET
-%endmacro
-
-INIT_MMX mmxext
-PUT_PIXELS_16
; The 8_X2 macro can easily be used here
INIT_XMM sse2
PUT_PIXELS8_X2
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index c190e7b473..c0913552d5 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -36,8 +36,6 @@
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
-void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
- ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_x2_sse2(uint8_t *block, const uint8_t *pixels,
@@ -66,10 +64,8 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const
uint8_t *pixels,
ptrdiff_t line_size, int h);
#define put_pixels8_mmx ff_put_pixels8_mmx
-#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
-#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
#if HAVE_INLINE_ASM
@@ -323,10 +319,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx,
put_no_rnd_pixels8_xy2_mmx, 8)
#undef DEF
#undef SET_RND
-#if HAVE_MMX
-CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8)
-#endif
-
#endif /* HAVE_INLINE_ASM */
@@ -334,12 +326,7 @@ CALL_2X_PIXELS(put_pixels16_xy2_mmx,
ff_put_pixels8_xy2_mmx, 8)
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2
## CPUEXT, 8) \
- CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2
## CPUEXT, 8) \
- CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2
## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2
## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2
## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2
## CPUEXT, 8) \
- CALL_2X_PIXELS(avg_approx_pixels16_xy2## CPUEXT,
ff_avg_approx_pixels8_xy2## CPUEXT, 8)
+ CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2
## CPUEXT, 8)
HPELDSP_AVG_PIXELS16(_mmxext)
@@ -359,17 +346,12 @@ HPELDSP_AVG_PIXELS16(_mmxext)
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ##
CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ##
CPU; \
} while (0)
-#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)
\
- do {
\
- SET_HPEL_FUNCS03(PFX, IDX, SIZE, CPU);
\
- SET_HPEL_FUNCS12(PFX, IDX, SIZE, CPU);
\
- } while (0)
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
{
#if HAVE_MMX_INLINE
- SET_HPEL_FUNCS03(put, [0], 16, mmx);
- SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
+ SET_HPEL_FUNCS12(put_no_rnd, [0], 16, mmx);
+ c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx);
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
SET_HPEL_FUNCS03(put, [1], 8, mmx);
@@ -380,14 +362,6 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags)
{
#if HAVE_MMXEXT_EXTERNAL
- c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
- c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
-
- c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
- c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
- c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
- c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
@@ -399,21 +373,18 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int
flags)
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
- c->avg_no_rnd_pixels_tab[0] = ff_avg_pixels16_mmxext;
-
if (!(flags & AV_CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
- c->avg_pixels_tab[0][3] = avg_approx_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_approx_pixels8_xy2_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
-static void hpeldsp_init_sse2_fast(HpelDSPContext *c, int flags)
+static void hpeldsp_init_sse2(HpelDSPContext *c, int flags)
{
#if HAVE_SSE2_EXTERNAL
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
@@ -449,8 +420,8 @@ av_cold void ff_hpeldsp_init_x86(HpelDSPContext *c, int
flags)
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags);
- if (EXTERNAL_SSE2_FAST(cpu_flags))
- hpeldsp_init_sse2_fast(c, flags);
+ if (EXTERNAL_SSE2(cpu_flags))
+ hpeldsp_init_sse2(c, flags);
if (EXTERNAL_SSSE3(cpu_flags))
hpeldsp_init_ssse3(c, flags);
commit 4d691da5edb360fa043df8ce267a382cfcdaf07a
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 05:24:49 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:04 2025 +0200
avcodec/x86/hpeldsp_init: Remove MMX functions overridden by MMXEXT
Forgotten in a51279bbdea0d6db920d71980262bccd0ce78226 because
I only looked for MMX(EXT) functions overridden by SSE2.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 6b2ad4494b..c190e7b473 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -161,38 +161,6 @@ static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block,
const uint8_t *pixels,
:FF_REG_a, "memory");
}
-static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
-{
- MOVQ_BFE(mm6);
- __asm__ volatile(
- "lea (%3, %3), %%"FF_REG_a" \n\t"
- ".p2align 3 \n\t"
- "1: \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "movq (%1), %%mm0 \n\t"
- "movq 1(%1), %%mm1 \n\t"
- "movq (%1, %3), %%mm2 \n\t"
- "movq 1(%1, %3), %%mm3 \n\t"
- PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
- "movq %%mm4, (%2) \n\t"
- "movq %%mm5, (%2, %3) \n\t"
- "add %%"FF_REG_a", %1 \n\t"
- "add %%"FF_REG_a", %2 \n\t"
- "subl $4, %0 \n\t"
- "jnz 1b \n\t"
- :"+g"(h), "+S"(pixels), "+D"(block)
- :"r"((x86_reg)line_size)
- :FF_REG_a, "memory");
-}
-
static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
@@ -405,7 +373,7 @@ static void hpeldsp_init_mmx(HpelDSPContext *c, int flags)
SET_HPEL_FUNCS12(avg_no_rnd, , 16, mmx);
c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_mmx;
SET_HPEL_FUNCS03(put, [1], 8, mmx);
- SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
+ SET_HPEL_FUNCS03(put_no_rnd, [1], 8, mmx);
#endif
}
commit 4e2ef29cbaa258cb73f06e62435198736e493a10
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Mon Sep 22 03:43:20 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:21:02 2025 +0200
tests/checkasm: Add hpeldsp checkasm
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 0a54adc96a..c41d719e82 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -12,6 +12,7 @@ AVCODECOBJS-$(CONFIG_H264CHROMA) += h264chroma.o
AVCODECOBJS-$(CONFIG_H264DSP) += h264dsp.o
AVCODECOBJS-$(CONFIG_H264PRED) += h264pred.o
AVCODECOBJS-$(CONFIG_H264QPEL) += h264qpel.o
+AVCODECOBJS-$(CONFIG_HPELDSP) += hpeldsp.o
AVCODECOBJS-$(CONFIG_IDCTDSP) += idctdsp.o
AVCODECOBJS-$(CONFIG_LLAUDDSP) += llauddsp.o
AVCODECOBJS-$(CONFIG_LLVIDDSP) += llviddsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index ad4d9b53b6..b23e4ce889 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -184,6 +184,9 @@ static const struct {
{ "hevc_pel", checkasm_check_hevc_pel },
{ "hevc_sao", checkasm_check_hevc_sao },
#endif
+ #if CONFIG_HPELDSP
+ { "hpeldsp", checkasm_check_hpeldsp },
+ #endif
#if CONFIG_HUFFYUV_DECODER
{ "huffyuvdsp", checkasm_check_huffyuvdsp },
#endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 1684c427d6..0f02c4fb6d 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -110,6 +110,7 @@ void checkasm_check_hevc_deblock(void);
void checkasm_check_hevc_idct(void);
void checkasm_check_hevc_pel(void);
void checkasm_check_hevc_sao(void);
+void checkasm_check_hpeldsp(void);
void checkasm_check_huffyuvdsp(void);
void checkasm_check_idctdsp(void);
void checkasm_check_idet(void);
diff --git a/tests/checkasm/hpeldsp.c b/tests/checkasm/hpeldsp.c
new file mode 100644
index 0000000000..ba290b3ab8
--- /dev/null
+++ b/tests/checkasm/hpeldsp.c
@@ -0,0 +1,115 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "checkasm.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/macros.h"
+#include "libavutil/mem_internal.h"
+#include "libavcodec/avcodec.h"
+#include "libavcodec/hpeldsp.h"
+
+#define MAX_BLOCK_SIZE 16
+#define MAX_HEIGHT 16
+#define MAX_STRIDE 64
+// BUF_SIZE is bigger than necessary in order to test strides > block width.
+#define BUF_SIZE ((MAX_HEIGHT - 1) * MAX_STRIDE + MAX_BLOCK_SIZE)
+// Due to hpel interpolation the input needs to have one more line than
+// the output and the last line needs one more element.
+// The input is not subject to alignment requirements; making the input buffer
+// bigger (by MAX_BLOCK_SIZE - 1) allows us to use a random misalignment.
+#define INPUT_BUF_SIZE (MAX_HEIGHT * MAX_STRIDE + MAX_BLOCK_SIZE + 1 +
(MAX_BLOCK_SIZE - 1))
+
+#define randomize_buffers(buf0, buf1) \
+ do { \
+ static_assert(sizeof(buf0) == sizeof(buf1), "Incompatible buffers"); \
+ static_assert(!(sizeof(buf0) % 4), "Tail handling needed"); \
+ static_assert(sizeof(buf0[0]) == 1 && sizeof(buf1[0]) == 1, \
+ "Pointer arithmetic needs to be adapted"); \
+ for (size_t k = 0; k < sizeof(buf0); k += 4) { \
+ uint32_t r = rnd(); \
+ AV_WN32A(buf0 + k, r); \
+ AV_WN32A(buf1 + k, r); \
+ } \
+ } while (0)
+
+
+void checkasm_check_hpeldsp(void)
+{
+ DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf0)[INPUT_BUF_SIZE];
+ DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, srcbuf1)[INPUT_BUF_SIZE];
+ DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf0)[BUF_SIZE];
+ DECLARE_ALIGNED(MAX_BLOCK_SIZE, uint8_t, dstbuf1)[BUF_SIZE];
+ HpelDSPContext hdsp;
+ static const struct {
+ const char *name;
+ size_t offset;
+ unsigned nb_blocksizes;
+ } tests[] = {
+#define TEST(NAME, NB) { .name = #NAME, .offset = offsetof(HpelDSPContext,
NAME), .nb_blocksizes = NB }
+ TEST(put_pixels_tab, 4),
+ TEST(avg_pixels_tab, 4),
+ TEST(put_no_rnd_pixels_tab, 2), // put_no_rnd_pixels_tab only has two
usable blocksizes
+ TEST(avg_no_rnd_pixels_tab, 1),
+ };
+ declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t
*dst, const uint8_t *src, ptrdiff_t stride, int h);
+
+ ff_hpeldsp_init(&hdsp, AV_CODEC_FLAG_BITEXACT);
+
+ for (size_t i = 0; i < FF_ARRAY_ELEMS(tests); ++i) {
+ op_pixels_func (*func_tab)[4] = (op_pixels_func (*)[4])((char*)&hdsp +
tests[i].offset);
+ for (unsigned j = 0; j < tests[i].nb_blocksizes; ++j) {
+ const unsigned blocksize = MAX_BLOCK_SIZE >> j;
+ // h must always be a multiple of four, except when width is two
or four.
+ const unsigned h_mult = blocksize <= 4 ? 2 : 4;
+
+ for (unsigned dxy = 0; dxy < 4; ++dxy) {
+ if (check_func(func_tab[j][dxy], "%s[%u][%u]", tests[i].name,
j, dxy)) {
+ // Don't always use output that is 16-aligned.
+ size_t dst_offset = (rnd() % (MAX_BLOCK_SIZE / blocksize))
* blocksize;
+ size_t src_offset = rnd() % MAX_BLOCK_SIZE;
+ ptrdiff_t stride = (rnd() % (MAX_STRIDE / blocksize) + 1)
* blocksize;
+ int h = (rnd() % (MAX_HEIGHT / h_mult) + 1) * h_mult;
+ const uint8_t *src0 = srcbuf0 + src_offset, *src1 =
srcbuf1 + src_offset;
+ uint8_t *dst0 = dstbuf0 + dst_offset, *dst1 = dstbuf1 +
dst_offset;
+
+ if (rnd() & 1) {
+ // Flip stride.
+ dst1 += (h - 1) * stride;
+ dst0 += (h - 1) * stride;
+ // Due to interpolation potentially h + 1 lines are
read
+ // from src, hence h * stride.
+ src0 += h * stride;
+ src1 += h * stride;
+ stride = -stride;
+ }
+
+ randomize_buffers(srcbuf0, srcbuf1);
+ randomize_buffers(dstbuf0, dstbuf1);
+ call_ref(dst0, src0, stride, h);
+ call_new(dst1, src1, stride, h);
+ if (memcmp(srcbuf0, srcbuf1, sizeof(srcbuf0)) ||
memcmp(dstbuf0, dstbuf1, sizeof(dstbuf0)))
+ fail();
+ bench_new(dst0, src0, stride, h);
+ }
+ }
+ }
+ }
+}
diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
index 56476d254c..7570c89ad9 100644
--- a/tests/fate/checkasm.mak
+++ b/tests/fate/checkasm.mak
@@ -27,6 +27,7 @@ FATE_CHECKASM = fate-checkasm-aacencdsp
\
fate-checkasm-hevc_idct \
fate-checkasm-hevc_pel \
fate-checkasm-hevc_sao \
+ fate-checkasm-hpeldsp \
fate-checkasm-huffyuvdsp \
fate-checkasm-idctdsp \
fate-checkasm-jpeg2000dsp \
commit fcb9e0b5f019ec46dffb6d769793ccb7d884fb14
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 06:11:43 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:56 2025 +0200
avcodec/hpel{dsp,_template}: Use ptrdiff_t for strides
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index 77ebcd74a2..67bee665a9 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -29,9 +29,9 @@
static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst, \
const uint8_t *src1, \
const uint8_t *src2, \
- int dst_stride, \
- int src_stride1, \
- int src_stride2, \
+ ptrdiff_t dst_stride, \
+ ptrdiff_t src_stride1, \
+ ptrdiff_t src_stride2, \
int h) \
{ \
int i; \
@@ -50,9 +50,9 @@ static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst,
\
static inline void FUNC(OPNAME ## _pixels4_l2)(uint8_t *dst, \
const uint8_t *src1, \
const uint8_t *src2, \
- int dst_stride, \
- int src_stride1, \
- int src_stride2, \
+ ptrdiff_t dst_stride, \
+ ptrdiff_t src_stride1, \
+ ptrdiff_t src_stride2, \
int h) \
{ \
int i; \
@@ -67,9 +67,9 @@ static inline void FUNC(OPNAME ## _pixels4_l2)(uint8_t *dst,
\
static inline void FUNC(OPNAME ## _pixels2_l2)(uint8_t *dst, \
const uint8_t *src1, \
const uint8_t *src2, \
- int dst_stride, \
- int src_stride1, \
- int src_stride2, \
+ ptrdiff_t dst_stride, \
+ ptrdiff_t src_stride1, \
+ ptrdiff_t src_stride2, \
int h) \
{ \
int i; \
@@ -84,9 +84,9 @@ static inline void FUNC(OPNAME ## _pixels2_l2)(uint8_t *dst,
\
static inline void FUNC(OPNAME ## _pixels16_l2)(uint8_t *dst, \
const uint8_t *src1, \
const uint8_t *src2, \
- int dst_stride, \
- int src_stride1, \
- int src_stride2, \
+ ptrdiff_t dst_stride, \
+ ptrdiff_t src_stride1, \
+ ptrdiff_t src_stride2, \
int h) \
{ \
FUNC(OPNAME ## _pixels8_l2)(dst, src1, src2, dst_stride, \
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
index db0e02ee93..688939ad3f 100644
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -39,9 +39,9 @@
static inline void OPNAME ## _no_rnd_pixels8_l2_8(uint8_t *dst, \
const uint8_t *src1, \
const uint8_t *src2, \
- int dst_stride, \
- int src_stride1, \
- int src_stride2, \
+ ptrdiff_t dst_stride, \
+ ptrdiff_t src_stride1,\
+ ptrdiff_t src_stride2,\
int h) \
{ \
int i; \
commit 89f2016ece77868cc1982ae104d56b25aaf519c3
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Tue Sep 23 05:34:37 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:52 2025 +0200
avcodec/hpel_template: Fix unintentional usage of unsigned offsets
The value of sizeof() is of type size_t which means that
an expression like
src1[i * src_stride1 + 4 * sizeof(pixel)]
will use a very large offset if src_stride1 is sufficiently negative.
It works in practice (because it is correct modulo SIZE_MAX + 1),
but UBSan treats it as error:
libavcodec/hpel_template.c:104:1: runtime error: addition of unsigned
offset to 0x7ffdfa0391d8 overflowed to 0x7ffdfa0391cc
Fix this by casting sizeof(pixel) to int.
(This has been uncovered by a checkasm test for the hpeldsp
which will be added in a later commit.)
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/hpel_template.c b/libavcodec/hpel_template.c
index fccfe7610f..77ebcd74a2 100644
--- a/libavcodec/hpel_template.c
+++ b/libavcodec/hpel_template.c
@@ -40,9 +40,9 @@ static inline void FUNC(OPNAME ## _pixels8_l2)(uint8_t *dst,
\
a = AV_RN4P(&src1[i * src_stride1]); \
b = AV_RN4P(&src2[i * src_stride2]); \
OP(*((pixel4 *) &dst[i * dst_stride]), rnd_avg_pixel4(a, b)); \
- a = AV_RN4P(&src1[i * src_stride1 + 4 * sizeof(pixel)]); \
- b = AV_RN4P(&src2[i * src_stride2 + 4 * sizeof(pixel)]); \
- OP(*((pixel4 *) &dst[i * dst_stride + 4 * sizeof(pixel)]), \
+ a = AV_RN4P(&src1[i * src_stride1 + 4 * (int)sizeof(pixel)]); \
+ b = AV_RN4P(&src2[i * src_stride2 + 4 * (int)sizeof(pixel)]); \
+ OP(*((pixel4 *) &dst[i * dst_stride + 4 * (int)sizeof(pixel)]), \
rnd_avg_pixel4(a, b)); \
} \
} \
commit b316a1bdd122ca1bcb43b20dbd6bc9c244f98cfe
Author: Andreas Rheinhardt <[email protected]>
AuthorDate: Sun Sep 21 22:51:18 2025 +0200
Commit: Andreas Rheinhardt <[email protected]>
CommitDate: Fri Sep 26 06:20:30 2025 +0200
avcodec/hpeldsp: Fix documentation
This commit fixes two issues in the documentation:
a) The documentation for {put,avg}_pixels_tab only mentions
widths 16 and 8, although it explicitly mentions that there
are four horizontal blocksizes. This part of the patch
basically reverts e5771f4f37b67951485205e110f4da5e7e32ea74.
b) The restrictions on height don't match the reality. While
most users abide by it, some do not:
i) vp56.c copies a 16x12 block.
ii) indeo3 can copy an arbitrary multiple of four lines
for block widths 4, 8 and 16.
iii) SVQ3 can use luma block sizes 16x16, 8x16,
16x8, 8x8, 4x8, 8x4 and 4x4 and the corresponding
8x8, 4x8, 8x4, 4x4, 2x4, 4x2 and 2x2 chroma block sizes.
This implies that for widths 2 and 4 height can be two
and is guaranteed to be at least even. For all other widths,
height can be a multiple of four.
Furthermore, a comment for the SVQ3 blocksizes has been added.
Reviewed-by: Lynne <[email protected]>
Signed-off-by: Andreas Rheinhardt <[email protected]>
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
index 41a46f0760..1f6a165bf6 100644
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -31,11 +31,12 @@
#include <stdint.h>
#include <stddef.h>
-/* add and put pixel (decoding) */
-// blocksizes for hpel_pixels_func are 8x4,8x8 16x8 16x16
-// h for hpel_pixels_func is limited to {width/2, width} but never larger
-// than 16 and never smaller than 4
-typedef void (*op_pixels_func)(uint8_t *block /*align width (8 or 16)*/,
+/**
+ * Average and put pixel
+ * Widths can be 16, 8, 4 or 2. For widths 2 and 4, h is always a positive
+ * multiple of 2; otherwise, it is a positive multiple of 4.
+ */
+typedef void (*op_pixels_func)(uint8_t *block /* align width */,
const uint8_t *pixels /*align 1*/,
ptrdiff_t line_size, int h);
@@ -46,8 +47,8 @@ typedef struct HpelDSPContext {
/**
* Halfpel motion compensation with rounding (a+b+1)>>1.
* this is an array[4][4] of motion compensation functions for 4
- * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+ * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br>
+ * *pixels_tab[ 0->16xH 1->8xH 2->4xH 3->2xH ][ xhalfpel + 2*yhalfpel ]
* @param block destination where the result is stored
* @param pixels source
* @param line_size number of bytes in a horizontal line of block
@@ -58,8 +59,8 @@ typedef struct HpelDSPContext {
/**
* Halfpel motion compensation with rounding (a+b+1)>>1.
* This is an array[4][4] of motion compensation functions for 4
- * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
- * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
+ * horizontal blocksizes (2,4,8,16) and the 4 halfpel positions<br>
+ * *pixels_tab[ 0->16xH 1->8xH 2->4xH 3->2xH ][ xhalfpel + 2*yhalfpel ]
* @param block destination into which the result is averaged (a+b+1)>>1
* @param pixels source
* @param line_size number of bytes in a horizontal line of block
@@ -85,7 +86,7 @@ typedef struct HpelDSPContext {
* Halfpel motion compensation with no rounding (a+b)>>1.
* this is an array[4] of motion compensation functions for 1
* horizontal blocksize (16) and the 4 halfpel positions<br>
- * *pixels_tab[0][ xhalfpel + 2*yhalfpel ]
+ * *pixels_tab[ xhalfpel + 2*yhalfpel ]
* @param block destination into which the result is averaged (a+b)>>1
* @param pixels source
* @param line_size number of bytes in a horizontal line of block
diff --git a/libavcodec/svq3.c b/libavcodec/svq3.c
index 4c4f3018c5..dfcfce77d3 100644
--- a/libavcodec/svq3.c
+++ b/libavcodec/svq3.c
@@ -504,6 +504,7 @@ static inline int svq3_mc_dir(SVQ3Context *s, int size, int
mode,
int dir, int avg)
{
int i, j, k, mx, my, dx, dy, x, y;
+ // 0->16x16,1->8x16,2->16x8,3->8x8,4->4x8,5->8x4,6->4x4
const int part_width = ((size & 5) == 4) ? 4 : 16 >> (size & 1);
const int part_height = 16 >> ((unsigned)(size + 1) / 3);
const int extra_width = (mode == PREDICT_MODE) ? -16 * 6 : 0;
-----------------------------------------------------------------------
Summary of changes:
libavcodec/hpel_template.c | 30 +--
libavcodec/hpeldsp.c | 6 +-
libavcodec/hpeldsp.h | 27 +--
libavcodec/svq3.c | 1 +
libavcodec/x86/fpel.asm | 2 -
libavcodec/x86/fpel.h | 4 -
libavcodec/x86/h264_qpel.c | 46 +---
libavcodec/x86/h264_qpel_8bit.asm | 60 -----
libavcodec/x86/hpeldsp.asm | 202 ++++++----------
libavcodec/x86/hpeldsp.h | 8 -
libavcodec/x86/hpeldsp_init.c | 364 ++++++++---------------------
libavcodec/x86/mpegvideoenc_qns_template.c | 109 ---------
libavcodec/x86/mpegvideoencdsp_init.c | 150 ++++++------
libavcodec/x86/qpeldsp_init.c | 41 ++--
libavcodec/x86/rnd_template.c | 98 --------
libavcodec/x86/rv40dsp_init.c | 12 -
libavcodec/x86/vvc/sao_10bit.asm | 38 ---
libavfilter/vf_gradfun.c | 2 -
libavfilter/x86/vf_gradfun.asm | 42 +---
libavfilter/x86/vf_gradfun_init.c | 22 --
tests/checkasm/Makefile | 1 +
tests/checkasm/checkasm.c | 3 +
tests/checkasm/checkasm.h | 1 +
tests/checkasm/hpeldsp.c | 115 +++++++++
tests/fate/checkasm.mak | 1 +
25 files changed, 440 insertions(+), 945 deletions(-)
delete mode 100644 libavcodec/x86/mpegvideoenc_qns_template.c
delete mode 100644 libavcodec/x86/rnd_template.c
create mode 100644 tests/checkasm/hpeldsp.c
hooks/post-receive
--
_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]