PR #20789 opened by mkver
URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789
Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/20789.patch
>From a0fa1c8e484f06cc9a9e2e3cfe53ec121fb74659 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 30 Oct 2025 08:30:40 +0100 Subject: [PATCH 1/3] avcodec/x86/hevc/add_res: Remove AVX add_residual functions The AVX and SSE2 functions are identical except for the VEX encodings used since e9abef437f0a348c017d4ac8b23a122881c1dc87 and 8b8492452d53293b2ac8c842877fadf7925fc950. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/add_res.asm | 7 +------ libavcodec/x86/hevc/dsp.h | 4 ---- libavcodec/x86/hevc/dsp_init.c | 4 ---- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm index 3ecbd4269c..5d7115620f 100644 --- a/libavcodec/x86/hevc/add_res.asm +++ b/libavcodec/x86/hevc/add_res.asm @@ -117,7 +117,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6 %endmacro -%macro TRANSFORM_ADD_8 0 +INIT_XMM sse2 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) cglobal hevc_add_residual_8_8, 3, 4, 8 pxor m4, m4 @@ -154,12 +154,7 @@ cglobal hevc_add_residual_32_8, 3, 5, 7 dec r4d jg .loop RET -%endmacro -INIT_XMM sse2 -TRANSFORM_ADD_8 -INIT_XMM avx -TRANSFORM_ADD_8 %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 diff --git a/libavcodec/x86/hevc/dsp.h b/libavcodec/x86/hevc/dsp.h index 03986b970a..0062699ce0 100644 --- a/libavcodec/x86/hevc/dsp.h +++ b/libavcodec/x86/hevc/dsp.h @@ -172,10 +172,6 @@ void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t s void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); -void ff_hevc_add_residual_8_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); -void ff_hevc_add_residual_16_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); -void ff_hevc_add_residual_32_8_avx(uint8_t *dst, const int16_t *res, ptrdiff_t stride); - void 
ff_hevc_add_residual_32_8_avx2(uint8_t *dst, const int16_t *res, ptrdiff_t stride); void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride); diff --git a/libavcodec/x86/hevc/dsp_init.c b/libavcodec/x86/hevc/dsp_init.c index 6966340c42..f1558b7e3e 100644 --- a/libavcodec/x86/hevc/dsp_init.c +++ b/libavcodec/x86/hevc/dsp_init.c @@ -877,10 +877,6 @@ void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth) c->idct[0] = ff_hevc_idct_4x4_8_avx; c->idct[1] = ff_hevc_idct_8x8_8_avx; - - c->add_residual[1] = ff_hevc_add_residual_8_8_avx; - c->add_residual[2] = ff_hevc_add_residual_16_8_avx; - c->add_residual[3] = ff_hevc_add_residual_32_8_avx; } if (EXTERNAL_AVX2(cpu_flags)) { c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2; -- 2.49.1 >From 17526beaf2ea13fd7e1484e8af0ae44baee6f8cb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 30 Oct 2025 08:49:38 +0100 Subject: [PATCH 2/3] avcodec/x86/hevc/add_res: Reduce number of registers used This makes these functions use only volatile registers (even on Win64). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/add_res.asm | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm index 5d7115620f..8abfcab893 100644 --- a/libavcodec/x86/hevc/add_res.asm +++ b/libavcodec/x86/hevc/add_res.asm @@ -61,20 +61,16 @@ cglobal hevc_add_residual_4_8, 3, 3, 6 movq m1, [r0+r2] punpcklbw m0, m4 punpcklbw m1, m4 - mova m2, [r1] - mova m3, [r1+16] - paddsw m0, m2 - paddsw m1, m3 + paddsw m0, [r1] + paddsw m1, [r1+16] packuswb m0, m1 movq m2, [r0+r2*2] movq m3, [r0+r3] punpcklbw m2, m4 punpcklbw m3, m4 - mova m6, [r1+32] - mova m7, [r1+48] - paddsw m2, m6 - paddsw m3, m7 + paddsw m2, [r1+32] + paddsw m3, [r1+48] packuswb m2, m3 movq [r0], m0 @@ -88,27 +84,33 @@ cglobal hevc_add_residual_4_8, 3, 3, 6 mova m2, m1 punpcklbw m1, m0 punpckhbw m2, m0 +%if cpuflag(avx2) mova xm5, [r1+%1] mova xm6, [r1+%1+16] -%if cpuflag(avx2) vinserti128 m5, m5, [r1+%1+32], 1 vinserti128 m6, m6, [r1+%1+48], 1 -%endif paddsw m1, m5 paddsw m2, m6 +%else + paddsw m1, [r1+%1] + paddsw m2, [r1+%1+16] +%endif mova m3, [%3] mova m4, m3 punpcklbw m3, m0 punpckhbw m4, m0 +%if cpuflag(avx2) mova xm5, [r1+%1+mmsize*2] mova xm6, [r1+%1+mmsize*2+16] -%if cpuflag(avx2) vinserti128 m5, m5, [r1+%1+96], 1 vinserti128 m6, m6, [r1+%1+112], 1 -%endif paddsw m3, m5 paddsw m4, m6 +%else + paddsw m3, [r1+%1+mmsize*2] + paddsw m4, [r1+%1+mmsize*2+16] +%endif packuswb m1, m2 packuswb m3, m4 @@ -119,7 +121,7 @@ cglobal hevc_add_residual_4_8, 3, 3, 6 INIT_XMM sse2 ; void ff_hevc_add_residual_8_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) -cglobal hevc_add_residual_8_8, 3, 4, 8 +cglobal hevc_add_residual_8_8, 3, 4, 5 pxor m4, m4 lea r3, [r2*3] ADD_RES_SSE_8_8 @@ -129,7 +131,7 @@ cglobal hevc_add_residual_8_8, 3, 4, 8 RET ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) -cglobal 
hevc_add_residual_16_8, 3, 5, 7 +cglobal hevc_add_residual_16_8, 3, 5, 5 pxor m0, m0 lea r3, [r2*3] mov r4d, 4 @@ -143,7 +145,7 @@ cglobal hevc_add_residual_16_8, 3, 5, 7 RET ; void ff_hevc_add_residual_32_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) -cglobal hevc_add_residual_32_8, 3, 5, 7 +cglobal hevc_add_residual_32_8, 3, 5, 5 pxor m0, m0 mov r4d, 16 .loop: -- 2.49.1 >From 894f415b278a07c9afbe349697bde80bbdab4e11 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Thu, 30 Oct 2025 09:58:13 +0100 Subject: [PATCH 3/3] avcodec/x86/hevc/add_res: Avoid unnecessary modification Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/hevc/add_res.asm | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/libavcodec/x86/hevc/add_res.asm b/libavcodec/x86/hevc/add_res.asm index 8abfcab893..3489e04e2b 100644 --- a/libavcodec/x86/hevc/add_res.asm +++ b/libavcodec/x86/hevc/add_res.asm @@ -27,9 +27,9 @@ cextern pw_1023 %define max_pixels_10 pw_1023 ; the add_res macros and functions were largely inspired by h264_idct.asm from the x264 project -%macro ADD_RES_MMX_4_8 0 - mova m0, [r1] - mova m2, [r1+8] +%macro ADD_RES_MMX_4_8 1 + mova m0, [r1+%1] + mova m2, [r1+%1+8] movd m1, [r0] movd m3, [r0+r2] @@ -50,27 +50,26 @@ INIT_MMX mmxext ; void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, const int16_t *res, ptrdiff_t stride) cglobal hevc_add_residual_4_8, 3, 3, 6 pxor m4, m4 - ADD_RES_MMX_4_8 - add r1, 16 + ADD_RES_MMX_4_8 0 lea r0, [r0+r2*2] - ADD_RES_MMX_4_8 + ADD_RES_MMX_4_8 16 RET -%macro ADD_RES_SSE_8_8 0 +%macro ADD_RES_SSE_8_8 1 movq m0, [r0] movq m1, [r0+r2] punpcklbw m0, m4 punpcklbw m1, m4 - paddsw m0, [r1] - paddsw m1, [r1+16] + paddsw m0, [r1+%1] + paddsw m1, [r1+%1+16] packuswb m0, m1 movq m2, [r0+r2*2] movq m3, [r0+r3] punpcklbw m2, m4 punpcklbw m3, m4 - paddsw m2, [r1+32] - paddsw m3, [r1+48] + paddsw m2, [r1+%1+32] + paddsw m3, [r1+%1+48] packuswb m2, m3 movq 
[r0], m0 @@ -124,10 +123,9 @@ INIT_XMM sse2 cglobal hevc_add_residual_8_8, 3, 4, 5 pxor m4, m4 lea r3, [r2*3] - ADD_RES_SSE_8_8 - add r1, 64 + ADD_RES_SSE_8_8 0 lea r0, [r0+r2*4] - ADD_RES_SSE_8_8 + ADD_RES_SSE_8_8 64 RET ; void ff_hevc_add_residual_16_8_<opt>(uint8_t *dst, const int16_t *res, ptrdiff_t stride) @@ -292,9 +290,8 @@ cglobal hevc_add_residual_4_10, 3, 3, 6 pxor m2, m2 mova m3, [max_pixels_10] ADD_RES_MMX_4_10 r0, r2, r1 - add r1, 16 lea r0, [r0+2*r2] - ADD_RES_MMX_4_10 r0, r2, r1 + ADD_RES_MMX_4_10 r0, r2, r1+16 RET INIT_XMM sse2 @@ -305,8 +302,7 @@ cglobal hevc_add_residual_8_10, 3, 4, 6 ADD_RES_SSE_8_10 r0, r2, r3, r1 lea r0, [r0+r2*4] - add r1, 64 - ADD_RES_SSE_8_10 r0, r2, r3, r1 + ADD_RES_SSE_8_10 r0, r2, r3, r1+64 RET cglobal hevc_add_residual_16_10, 3, 5, 6 -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
