Patches attached. - Andreas
From a33bb0298c0eaf7816c36be7c5558cfcc5a3f37c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinha...@outlook.com> Date: Fri, 30 May 2025 12:40:16 +0200 Subject: [PATCH 1/4] avcodec/hpeldsp: Remove duplicate pel functions
Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com> --- libavcodec/hpeldsp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c index 80494c9749..db0e02ee93 100644 --- a/libavcodec/hpeldsp.c +++ b/libavcodec/hpeldsp.c @@ -314,9 +314,6 @@ CALL_2X_PIXELS(OPNAME ## _pixels16_y2_8_c, \ CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_8_c, \ OPNAME ## _pixels8_xy2_8_c, \ 8) \ -CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_8_c, \ - OPNAME ## _pixels8_8_c, \ - 8) \ CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_8_c, \ OPNAME ## _no_rnd_pixels8_x2_8_c, \ 8) \ @@ -330,6 +327,8 @@ CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_8_c, \ #define op_avg(a, b) a = rnd_avg32(a, b) #define op_put(a, b) a = b #define put_no_rnd_pixels8_8_c put_pixels8_8_c +#define put_no_rnd_pixels16_8_c put_pixels16_8_c +#define avg_no_rnd_pixels16_8_c avg_pixels16_8_c PIXOP2(avg, op_avg) PIXOP2(put, op_put) #undef op_avg -- 2.45.2
From 380cf96fd94da3a47e9fd008213b1f7d9e6e454d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinha...@outlook.com> Date: Fri, 30 May 2025 12:55:54 +0200 Subject: [PATCH 2/4] avcodec/x86/hpeldsp_init: Use ff_avg_pixels16_mmxext avg_pixels_tab[0][0] does the same as avg_no_rnd_pixels_tab[0]. Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com> --- libavcodec/x86/hpeldsp_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 4a0513d06d..12edcd9e83 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -122,7 +122,6 @@ CALL_2X_PIXELS(put_pixels16_xy2_mmx, ff_put_pixels8_xy2_mmx, 8) CALL_2X_PIXELS(put_no_rnd_pixels16_x2 ## CPUEXT, ff_put_no_rnd_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(put_pixels16_y2 ## CPUEXT, ff_put_pixels8_y2 ## CPUEXT, 8) \ CALL_2X_PIXELS(put_no_rnd_pixels16_y2 ## CPUEXT, ff_put_no_rnd_pixels8_y2 ## CPUEXT, 8) \ - CALL_2X_PIXELS(avg_pixels16 ## CPUEXT, ff_avg_pixels8 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_x2 ## CPUEXT, ff_avg_pixels8_x2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_y2 ## CPUEXT, ff_avg_pixels8_y2 ## CPUEXT, 8) \ CALL_2X_PIXELS(avg_pixels16_xy2 ## CPUEXT, ff_avg_pixels8_xy2 ## CPUEXT, 8) \ @@ -170,7 +169,7 @@ static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags) c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; - c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; + c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext; c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; -- 2.45.2
From 268232ef14099c1a46e753840491c94040696532 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinha...@outlook.com> Date: Fri, 30 May 2025 13:54:50 +0200 Subject: [PATCH 3/4] avcodec/hpeldsp_init: Detemplatize Since a51279bbdea0d6db920d71980262bccd0ce78226, hpeldsp_rnd_template.c was only included once and one of the two functions in rnd_template.c was only enabled once. Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com> --- libavcodec/x86/hpeldsp_init.c | 258 +++++++++++++++++++++++++- libavcodec/x86/hpeldsp_rnd_template.c | 202 -------------------- libavcodec/x86/rnd_template.c | 79 -------- 3 files changed, 252 insertions(+), 287 deletions(-) delete mode 100644 libavcodec/x86/hpeldsp_rnd_template.c diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c index 12edcd9e83..6b2ad4494b 100644 --- a/libavcodec/x86/hpeldsp_init.c +++ b/libavcodec/x86/hpeldsp_init.c @@ -22,6 +22,9 @@ * MMX optimization by Nick Kurshev <nickol...@mail.ru> */ +#include <stddef.h> +#include <stdint.h> + #include "libavutil/attributes.h" #include "libavutil/cpu.h" #include "libavutil/x86/cpu.h" @@ -74,19 +77,263 @@ void ff_avg_approx_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, /* MMX no rounding */ #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx #define SET_RND MOVQ_WONE -#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) -#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) #define STATIC static #include "rnd_template.c" -#include "hpeldsp_rnd_template.c" #undef DEF #undef SET_RND -#undef PAVGBP -#undef PAVGB #undef STATIC +// this routine is 'slightly' suboptimal but mostly unused +static void avg_no_rnd_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, + ptrdiff_t line_size, int h) +{ + MOVQ_ZERO(mm7); + MOVQ_WONE(mm6); // =2 for rnd and =1 for no_rnd version + __asm__ volatile( + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm4 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" + "add %3, %1 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" + "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" + "movq %%mm0, %%mm1 \n\t" + "movq %%mm2, %%mm3 \n\t" + "punpcklbw %%mm7, %%mm0 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpckhbw %%mm7, %%mm1 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "paddusw %%mm2, %%mm0 \n\t" + "paddusw %%mm3, %%mm1 \n\t" + "paddusw %%mm6, %%mm4 \n\t" + "paddusw %%mm6, %%mm5 \n\t" + "paddusw %%mm0, %%mm4 \n\t" + "paddusw %%mm1, %%mm5 \n\t" + "psrlw $2, %%mm4 \n\t" + "psrlw $2, %%mm5 \n\t" + "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" + "packuswb %%mm5, %%mm4 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) + "movq %%mm5, (%2, %%"FF_REG_a") \n\t" + "add %3, %%"FF_REG_a" \n\t" + + "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 + "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" + "movq %%mm2, %%mm3 \n\t" + "movq %%mm4, %%mm5 \n\t" + "punpcklbw %%mm7, %%mm2 \n\t" + "punpcklbw %%mm7, %%mm4 \n\t" + "punpckhbw %%mm7, %%mm3 \n\t" + "punpckhbw %%mm7, %%mm5 \n\t" + "paddusw %%mm2, %%mm4 \n\t" + "paddusw %%mm3, %%mm5 \n\t" + "paddusw %%mm6, %%mm0 \n\t" + "paddusw %%mm6, %%mm1 \n\t" + "paddusw %%mm4, %%mm0 \n\t" + "paddusw %%mm5, %%mm1 \n\t" + "psrlw $2, %%mm0 \n\t" + "psrlw $2, %%mm1 \n\t" + "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" + "packuswb %%mm1, %%mm0 \n\t" + "pcmpeqd %%mm2, %%mm2 \n\t" + "paddb %%mm2, %%mm2 \n\t" + PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) + "movq %%mm1, (%2, %%"FF_REG_a") \n\t" + "add %3, %%"FF_REG_a" \n\t" + + "subl $2, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels) + :"D"(block), "r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%1, %3), %%mm2 \n\t" + "movq 1(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%1, %3), %%mm2 \n\t" + "movq 9(%1, %3), %%mm3 \n\t" + PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) + "movq %%mm4, 8(%2) \n\t" + "movq %%mm5, 8(%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void put_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"),%%mm2\n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"),%%mm0\n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq %%mm4, (%2) \n\t" + "movq %%mm5, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + +static void avg_no_rnd_pixels16_x2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1), %%mm0 \n\t" + "movq 1(%1), %%mm1 \n\t" + "movq (%2), %%mm3 \n\t" + PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq 8(%1), %%mm0 \n\t" + "movq 9(%1), %%mm1 \n\t" + "movq 8(%2), %%mm3 \n\t" + PAVGB_MMX_NO_RND(%%mm0, %%mm1, %%mm2, %%mm6) + PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) + "movq %%mm0, 8(%2) \n\t" + "add %3, %1 \n\t" + "add %3, %2 \n\t" + "subl $1, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :"memory"); +} + +static void avg_no_rnd_pixels8_y2_mmx(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) +{ + MOVQ_BFE(mm6); + __asm__ volatile( + "lea (%3, %3), %%"FF_REG_a" \n\t" + "movq (%1), %%mm0 \n\t" + ".p2align 3 \n\t" + "1: \n\t" + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm0, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + + "movq (%1, %3), %%mm1 \n\t" + "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" + PAVGBP_MMX_NO_RND(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) + "movq (%2), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) + "movq (%2, %3), %%mm3 \n\t" + PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) + "movq %%mm2, (%2) \n\t" + "movq %%mm1, (%2, %3) \n\t" + "add %%"FF_REG_a", %1 \n\t" + "add %%"FF_REG_a", %2 \n\t" + + "subl $4, %0 \n\t" + "jnz 1b \n\t" + :"+g"(h), "+S"(pixels), "+D"(block) + :"r"((x86_reg)line_size) + :FF_REG_a, "memory"); +} + #if HAVE_MMX CALL_2X_PIXELS(avg_no_rnd_pixels16_y2_mmx, avg_no_rnd_pixels8_y2_mmx, 8) CALL_2X_PIXELS(put_no_rnd_pixels16_y2_mmx, put_no_rnd_pixels8_y2_mmx, 8) @@ -101,7 +348,6 @@ CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_mmx, put_no_rnd_pixels8_xy2_mmx, 8) #define SET_RND MOVQ_WTWO #define DEF(x, y) ff_ ## x ## _ ## y ## _mmx #define STATIC -#define NO_AVG #include "rnd_template.c" diff --git a/libavcodec/x86/hpeldsp_rnd_template.c b/libavcodec/x86/hpeldsp_rnd_template.c deleted file mode 100644 index 2bff2d2766..0000000000 --- a/libavcodec/x86/hpeldsp_rnd_template.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd - * Copyright (c) 2000, 2001 Fabrice Bellard - * Copyright (c) 2003-2004 Michael Niedermayer <michae...@gmx.at> - * - * MMX optimization by Nick Kurshev <nickol...@mail.ru> - * mostly rewritten by Michael Niedermayer <michae...@gmx.at> - * and improved by Zdenek Kabelac <k...@users.sf.net> - * - * This file is part of FFmpeg. - * - * FFmpeg is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * FFmpeg is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with FFmpeg; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <stddef.h> -#include <stdint.h> - -// put_pixels -av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%1, %3), %%mm2 \n\t" - "movq 1(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%1, %3), %%mm2 \n\t" - "movq 9(%1, %3), %%mm3 \n\t" - PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) - "movq %%mm4, 8(%2) \n\t" - "movq %%mm5, 8(%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"),%%mm2\n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"),%%mm0\n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq %%mm4, (%2) \n\t" - "movq %%mm5, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} - -av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm1 \n\t" - "movq (%2), %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq 8(%1), %%mm0 \n\t" - "movq 9(%1), %%mm1 \n\t" - "movq 8(%2), %%mm3 \n\t" - PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) - PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) - "movq %%mm0, 8(%2) \n\t" - "add %3, %1 \n\t" - "add %3, %2 \n\t" - "subl $1, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :"memory"); -} - -av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) -{ - MOVQ_BFE(mm6); - __asm__ volatile( - "lea (%3, %3), %%"FF_REG_a" \n\t" - "movq (%1), %%mm0 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" - PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm0, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - - "movq (%1, %3), %%mm1 \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) - "movq (%2), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) - "movq (%2, %3), %%mm3 \n\t" - PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) - "movq %%mm2, (%2) \n\t" - "movq %%mm1, (%2, %3) \n\t" - "add %%"FF_REG_a", %1 \n\t" - "add %%"FF_REG_a", %2 \n\t" - - "subl $4, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels), "+D"(block) - :"r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} diff --git a/libavcodec/x86/rnd_template.c b/libavcodec/x86/rnd_template.c index b825eeba6e..4590aeddf0 100644 --- a/libavcodec/x86/rnd_template.c +++ b/libavcodec/x86/rnd_template.c @@ -96,82 +96,3 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel :"D"(block), "r"((x86_reg)line_size) :FF_REG_a, "memory"); } - -#ifndef NO_AVG -// avg_pixels -// this routine is 'slightly' suboptimal but mostly unused -av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, - ptrdiff_t line_size, int h) -{ - MOVQ_ZERO(mm7); - SET_RND(mm6); // =2 for rnd and =1 for no_rnd version - __asm__ volatile( - "movq (%1), %%mm0 \n\t" - "movq 1(%1), %%mm4 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" - "add %3, %1 \n\t" - ".p2align 3 \n\t" - "1: \n\t" - "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" - "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" - "movq %%mm0, %%mm1 \n\t" - "movq %%mm2, %%mm3 \n\t" - "punpcklbw %%mm7, %%mm0 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpckhbw %%mm7, %%mm1 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "paddusw %%mm2, %%mm0 \n\t" - "paddusw %%mm3, %%mm1 \n\t" - "paddusw %%mm6, %%mm4 \n\t" - "paddusw %%mm6, %%mm5 \n\t" - "paddusw %%mm0, %%mm4 \n\t" - "paddusw %%mm1, %%mm5 \n\t" - "psrlw $2, %%mm4 \n\t" - "psrlw $2, %%mm5 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm5, %%mm4 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) - "movq %%mm5, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 - "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" - "movq %%mm2, %%mm3 \n\t" - "movq %%mm4, %%mm5 \n\t" - "punpcklbw %%mm7, %%mm2 \n\t" - "punpcklbw %%mm7, %%mm4 \n\t" - "punpckhbw %%mm7, %%mm3 \n\t" - "punpckhbw %%mm7, %%mm5 \n\t" - "paddusw %%mm2, %%mm4 \n\t" - "paddusw %%mm3, %%mm5 \n\t" - "paddusw %%mm6, %%mm0 \n\t" - "paddusw %%mm6, %%mm1 \n\t" - "paddusw %%mm4, %%mm0 \n\t" - "paddusw %%mm5, %%mm1 \n\t" - "psrlw $2, %%mm0 \n\t" - "psrlw $2, %%mm1 \n\t" - "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" - "packuswb %%mm1, %%mm0 \n\t" - "pcmpeqd %%mm2, %%mm2 \n\t" - "paddb %%mm2, %%mm2 \n\t" - PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) - "movq %%mm1, (%2, %%"FF_REG_a") \n\t" - "add %3, %%"FF_REG_a" \n\t" - - "subl $2, %0 \n\t" - "jnz 1b \n\t" - :"+g"(h), "+S"(pixels) - :"D"(block), "r"((x86_reg)line_size) - :FF_REG_a, "memory"); -} -#endif -- 2.45.2
From 1754b6a8aba861c2962811a2d1b7f87ec3e130b2 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <andreas.rheinha...@outlook.com> Date: Fri, 30 May 2025 14:56:57 +0200 Subject: [PATCH 4/4] avcodec/vc1dsp: Fix vc1op_pixels_func semantics Signed-off-by: Andreas Rheinhardt <andreas.rheinha...@outlook.com> --- libavcodec/vc1dsp.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h index e3b90d2b62..b018537af3 100644 --- a/libavcodec/vc1dsp.h +++ b/libavcodec/vc1dsp.h @@ -30,7 +30,9 @@ #include "hpeldsp.h" #include "h264chroma.h" -typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h); +typedef void (*vc1op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, + const uint8_t *pixels/*align 1*/, + ptrdiff_t line_size, int round); typedef struct VC1DSPContext { /* vc1 functions */ -- 2.45.2
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".