PR #22356 opened by mkver URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22356 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/22356.patch
>From 63a61e32a694e9929fc0a73ac4c0a7c35b1616b1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 13:45:18 +0100 Subject: [PATCH 01/28] avcodec/x86/vvc/of: Avoid unnecessary additions BDOF_PROF_GRAD just adds some values to m12,m13, so one can avoid two pxor, paddw by not saving these registers prematurely. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 232dc1c2fd..4a28550690 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -327,15 +327,11 @@ INIT_YMM avx2 BDOF_PROF_GRAD 0, 0 %endif - mova m14, m12 - mova m15, m13 - - pxor m12, m12 - pxor m13, m13 BDOF_PROF_GRAD %1 * 4 + 1, 0 BDOF_PROF_GRAD %1 * 4 + 2, 0 - paddw m14, m12 - paddw m15, m13 + + mova m14, m12 + mova m15, m13 pxor m12, m12 pxor m13, m13 -- 2.52.0 >From 3edac4576b92612752d62cf3baeb6f898d922191 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 14:57:59 +0100 Subject: [PATCH 02/28] avcodec/x86/vvc/of: Avoid initialization, addition for first block Output directly to the desired destination registers instead of zeroing them, followed by adding the desired values. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 4a28550690..8ad68aa16f 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -230,14 +230,20 @@ INIT_YMM avx2 pshufhw m6, m6, q2301 paddw m8, m6, m11 ; 4 x (4sgx2, 4sgy2, 4sgxdi, 4sgydi) -%if (%1) == 0 || (%2) - ; pad for top and bottom +%if (%1) == 0 + ; pad for top and directly output to m12, m13 + paddw m12, m8, m8 + paddw m13, m10, m10 +%else +%if (%2) + ; pad for bottom paddw m8, m8 paddw m10, m10 %endif paddw m12, m8 paddw m13, m10 +%endif %endmacro @@ -321,9 +327,6 @@ INIT_YMM avx2 movu m3, [src1q + 0 * SRC_STRIDE + SRC_PS] movu m4, [src1q + 1 * SRC_STRIDE + SRC_PS] - pxor m12, m12 - pxor m13, m13 - BDOF_PROF_GRAD 0, 0 %endif -- 2.52.0 >From 270202f2aa4289660ff5d75ab6ae5dfc4a892dfc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 15:21:23 +0100 Subject: [PATCH 03/28] avcodec/x86/vvc/of: Avoid initialization, addition for last block When processing the last block, we no longer need to preserve some registers for the next block, allowing simplifications. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 8ad68aa16f..b77e1fdf68 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -333,21 +333,25 @@ INIT_YMM avx2 BDOF_PROF_GRAD %1 * 4 + 1, 0 BDOF_PROF_GRAD %1 * 4 + 2, 0 +%if (%2) + BDOF_PROF_GRAD %1 * 4 + 3, %2 + BDOF_VX_VY 12, 13 + APPLY_BDOF_MIN_BLOCK %1, m12, m13, bd +%else mova m14, m12 mova m15, m13 pxor m12, m12 pxor m13, m13 - BDOF_PROF_GRAD %1 * 4 + 3, %2 -%if (%2) == 0 + BDOF_PROF_GRAD %1 * 4 + 3, 0 BDOF_PROF_GRAD %1 * 4 + 4, 0 -%endif paddw m14, m12 paddw m15, m13 BDOF_VX_VY 14, 15 APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd lea dstq, [dstq + 4 * dsq] +%endif %endmacro ;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, -- 2.52.0 >From b092da9a23ce29d9733aeb1006956791b06eec0f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 17:08:44 +0100 Subject: [PATCH 04/28] avcodec/x86/vvc/of: Avoid unnecessary jumps For 8bpp width 8 content, an unnecessary jump was performed for every write: First to the end of the SAVE_8BPC macro, then to the end of the SAVE macro. This commit changes this. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index b77e1fdf68..180309ef63 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -72,17 +72,17 @@ INIT_YMM avx2 CLIPW %1, m9, m10 %endmacro -%macro SAVE_8BPC 2 ; dst, src +%macro SAVE_8BPC 3 ; dst, src, jmp dst packuswb m%2, m%2 vpermq m%2, m%2, q0020 cmp wd, 16 je %%w16 movq %1, xm%2 - jmp %%wend + jmp %3 %%w16: movu %1, xm%2 -%%wend: + jmp %3 %endmacro %macro SAVE_16BPC 2 ; dst, src @@ -98,8 +98,7 @@ INIT_YMM avx2 %macro SAVE 2 ; dst, src cmp pixel_maxd, (1 << 8) - 1 jne %%save_16bpc - SAVE_8BPC %1, %2 - jmp %%end + SAVE_8BPC %1, %2, %%end %%save_16bpc: SAVE_16BPC %1, %2 %%end: -- 2.52.0 >From deac205dc309ccc1d0cc002a336de4f8091a284a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 18:01:39 +0100 Subject: [PATCH 05/28] avcodec/x86/vvc/of: Deduplicate writing, save jump Both the 8bpp width 16 and >8bpp width 8 cases write 16 contiguous bytes; deduplicate writing them. In fact, by putting this block of code at the end of the SAVE macro, one can even save a jmp for the width 16 8bpp case (without adversely affecting the other cases). 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 180309ef63..895535c754 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -72,35 +72,27 @@ INIT_YMM avx2 CLIPW %1, m9, m10 %endmacro -%macro SAVE_8BPC 3 ; dst, src, jmp dst +%macro SAVE 2 ; dst, src + cmp pixel_maxd, (1 << 8) - 1 + jne %%save_16bpc + packuswb m%2, m%2 vpermq m%2, m%2, q0020 cmp wd, 16 - je %%w16 + je %%w16_8 movq %1, xm%2 - jmp %3 -%%w16: - movu %1, xm%2 - jmp %3 -%endmacro + jmp %%end -%macro SAVE_16BPC 2 ; dst, src - cmp wd, 16 - je %%w16 - movu %1, xm%2 - jmp %%wend -%%w16: - movu %1, m%2 -%%wend: -%endmacro - -%macro SAVE 2 ; dst, src - cmp pixel_maxd, (1 << 8) - 1 - jne %%save_16bpc - SAVE_8BPC %1, %2, %%end %%save_16bpc: - SAVE_16BPC %1, %2 + cmp wd, 16 + jne %%w8_16 + movu %1, m%2 + jmp %%end + +%%w16_8: +%%w8_16: + movu %1, xm%2 %%end: %endmacro -- 2.52.0 >From c8194d95451c3fac0acbcfaa1ad8370fbd9d38f2 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 22 Feb 2026 19:19:14 +0100 Subject: [PATCH 06/28] avcodec/x86/vvc/of,dsp_init: Avoid unnecessary wrappers Write them in assembly instead; this exchanges a call+ret with a jmp and also avoids the stack for (1<<bpp)-1. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/dsp_init.c | 30 +++++++++--------------------- libavcodec/x86/vvc/of.asm | 28 ++++++++++++++++++++-------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 37ddbcb73b..158308fb33 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -50,24 +50,12 @@ DMVR_PROTOTYPES( 8, avx2) DMVR_PROTOTYPES(10, avx2) DMVR_PROTOTYPES(12, avx2) -#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL -void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *src0, const int16_t *src1, - int w, int h, int pixel_max); - -#define OF_FUNC(bd, opt) \ -static void vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ - const int16_t *src0, const int16_t *src1, int w, int h) \ -{ \ - ff_vvc_apply_bdof##_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd) - 1); \ -} \ - -OF_FUNC( 8, avx2) -OF_FUNC(10, avx2) -OF_FUNC(12, avx2) - -#define OF_INIT(bd) c->inter.apply_bdof = vvc_apply_bdof_##bd##_avx2 -#endif +#define OF_INIT(BD, OPT) do { \ +void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, \ + const int16_t *src0, const int16_t *src1, \ + int w, int h); \ + c->inter.apply_bdof = ff_vvc_apply_bdof_## BD ##_## OPT; \ +} while (0) #define ALF_BPC_PROTOTYPES(bpc, opt) \ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ @@ -340,7 +328,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) AVG_INIT(8, avx2); DMVR_INIT(8); MC_LINKS_AVX2(8); - OF_INIT(8); + OF_INIT(8, avx2); SAD_INIT(); // filter @@ -362,7 +350,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) DMVR_INIT(10); MC_LINKS_AVX2(10); MC_LINKS_16BPC_AVX2(10); - OF_INIT(10); + OF_INIT(10, avx2); SAD_INIT(); // filter @@ -384,7 +372,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) DMVR_INIT(12); MC_LINKS_AVX2(12); 
MC_LINKS_16BPC_AVX2(12); - OF_INIT(12); + OF_INIT(12, avx2); SAD_INIT(); // filter diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 895535c754..5184144739 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -345,11 +345,27 @@ INIT_YMM avx2 %endif %endmacro -;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, int16_t *src0, int16_t *src1, -; const int w, const int h, const int int pixel_max) -%macro BDOF_AVX2 0 -cglobal vvc_apply_bdof, 7, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0 +%macro BDOF_WRAPPER 2 ; bpp, is_nonadjacent +;void ff_vvc_apply_bdof_%1(uint8_t *dst, const ptrdiff_t dst_stride, const int16_t *src0, +; const int16_t *src1, const int w, const int h) +cglobal vvc_apply_bdof_%1 + ; r6 is not used for parameter passing and is volatile both on UNIX64 + ; and Win64, so it can be freely used + mov r6d, (1<<%1)-1 +%if %2 + jmp vvc_apply_bdof_ %+ cpuname +%endif +%endmacro +%macro VVC_OF_AVX2 0 + BDOF_WRAPPER 12, 1 + BDOF_WRAPPER 8, 1 + BDOF_WRAPPER 10, 0 + +vvc_apply_bdof_ %+ cpuname: +; the prologue on Win64 is big (10 xmm regs need saving), so use PROLOGUE +; to avoid duplicating it. +PROLOGUE 6, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3, tmp0 lea ds3q, [dsq * 3] sub src0q, SRC_STRIDE + SRC_PS sub src1q, SRC_STRIDE + SRC_PS @@ -370,10 +386,6 @@ cglobal vvc_apply_bdof, 7, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, RET %endmacro -%macro VVC_OF_AVX2 0 - BDOF_AVX2 -%endmacro - VVC_OF_AVX2 %endif ; HAVE_AVX2_EXTERNAL -- 2.52.0 >From b6071c904f46e71d5ba958bcec49f27f4815db10 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 15:37:52 +0100 Subject: [PATCH 07/28] avcodec/x86/vvc/of: Only clip for >8bpp packuswb does it already for 8bpp. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 5184144739..64c972786c 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -69,7 +69,6 @@ INIT_YMM avx2 paddw %4, [src1q + (%5 + 1) * SRC_STRIDE + SRC_PS] paddsw %1, %4 ; src0[x] + src1[x] + bdof_offset pmulhrsw %1, m11 - CLIPW %1, m9, m10 %endmacro %macro SAVE 2 ; dst, src @@ -85,6 +84,7 @@ INIT_YMM avx2 jmp %%end %%save_16bpc: + CLIPW m%2, m9, m10 cmp wd, 16 jne %%w8_16 movu %1, m%2 -- 2.52.0 >From fb085fdbc7a884036b3949df0b3b52ccba8bf993 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 15:39:39 +0100 Subject: [PATCH 08/28] avcodec/x86/vvc/of: Ignore upper lane for width 8 Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 64c972786c..14a9ae6898 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -76,7 +76,6 @@ INIT_YMM avx2 jne %%save_16bpc packuswb m%2, m%2 - vpermq m%2, m%2, q0020 cmp wd, 16 je %%w16_8 @@ -91,6 +90,7 @@ INIT_YMM avx2 jmp %%end %%w16_8: + vpermq m%2, m%2, q0020 %%w8_16: movu %1, xm%2 %%end: -- 2.52.0 >From 224c68c8c8a759bebe412061fed8f52a193a677f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 16:25:32 +0100 Subject: [PATCH 09/28] avcodec/x86/vvc/of: Avoid jump At the end of the height==8 codepath, a jump to RET at the end of the height==16 codepath is performed. Yet the epilogue is so cheap on Unix64 that this jump is not worthwhile. For Win64 meanwhile, one can still avoid jumps, because for width 16 >8bpp and width 8 8bpp content a jump is performed to the end of the height==8 position, immediately followed by a jump to RET. 
These two jumps can be combined into one. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/of.asm | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vvc/of.asm b/libavcodec/x86/vvc/of.asm index 14a9ae6898..eca52f244f 100644 --- a/libavcodec/x86/vvc/of.asm +++ b/libavcodec/x86/vvc/of.asm @@ -71,7 +71,7 @@ INIT_YMM avx2 pmulhrsw %1, m11 %endmacro -%macro SAVE 2 ; dst, src +%macro SAVE 2-3 ""; dst, src, jump target cmp pixel_maxd, (1 << 8) - 1 jne %%save_16bpc @@ -80,14 +80,22 @@ INIT_YMM avx2 cmp wd, 16 je %%w16_8 movq %1, xm%2 +%ifnidn %3, "" + jmp %3 +%else jmp %%end +%endif %%save_16bpc: CLIPW m%2, m9, m10 cmp wd, 16 jne %%w8_16 movu %1, m%2 +%ifnidn %3, "" + jmp %3 +%else jmp %%end +%endif %%w16_8: vpermq m%2, m%2, q0020 @@ -98,7 +106,7 @@ INIT_YMM avx2 ; [rsp + even * mmsize] are gradient_h[0] - gradient_h[1] ; [rsp + odd * mmsize] are gradient_v[0] - gradient_v[1] -%macro APPLY_BDOF_MIN_BLOCK 4 ; block_num, vx, vy, bd +%macro APPLY_BDOF_MIN_BLOCK 3-4 ""; block_num, vx, vy, jump target pxor m9, m9 movd xm10, pixel_maxd @@ -118,7 +126,7 @@ INIT_YMM avx2 SAVE [dstq + 2 * dsq], 6 APPLY_BDOF_MIN_BLOCK_LINE m6, %2, %3, m7, (%1) * 4 + 3 - SAVE [dstq + ds3q], 6 + SAVE [dstq + ds3q], 6, %4 %endmacro %macro SUM_MIN_BLOCK_W16 4 ; src/dst, shuffle, perm, tmp @@ -327,7 +335,12 @@ INIT_YMM avx2 %if (%2) BDOF_PROF_GRAD %1 * 4 + 3, %2 BDOF_VX_VY 12, 13 - APPLY_BDOF_MIN_BLOCK %1, m12, m13, bd +%if UNIX64 + APPLY_BDOF_MIN_BLOCK %1, m12, m13 +%else + APPLY_BDOF_MIN_BLOCK %1, m12, m13, .end +%endif + %else mova m14, m12 mova m15, m13 @@ -340,7 +353,7 @@ INIT_YMM avx2 paddw m15, m13 BDOF_VX_VY 14, 15 - APPLY_BDOF_MIN_BLOCK %1, m14, m15, bd + APPLY_BDOF_MIN_BLOCK %1, m14, m15 lea dstq, [dstq + 4 * dsq] %endif %endmacro @@ -375,7 +388,11 @@ PROLOGUE 6, 9, 16, BDOF_STACK_SIZE*32, dst, ds, src0, src1, w, h, pixel_max, ds3 cmp hd, 16 je .h16 BDOF_MINI_BLOCKS 1, 1 +%if UNIX64 + RET +%else jmp .end 
+%endif .h16: BDOF_MINI_BLOCKS 1, 0 -- 2.52.0 >From 0fa5900cf8b1d1d03b1ea5a2732948c6e753e858 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 17:42:20 +0100 Subject: [PATCH 10/28] avcodec/x86/vvc/alf: Use immediate for shift when possible Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index ccb236294a..572427f98a 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -760,12 +760,18 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w pblendvb m2, m5, m8, m1 ; hvd1 pblendvb m3, m6, m9, m1 ; hvd0 +%if ps != 1 ; high bit depth movd xm5, bit_depthd vpbroadcastd m5, xm5 +%endif ;*class_idx = arg_var[av_clip_uintp2(sum_hv * ac >> (BIT_DEPTH - 1), 4)]; pmulld m0, m14 ; sum_hv * ac +%if ps != 1 vpsrlvd m0, m0, m5 +%else + psrld m0, 8 +%endif pminsd m0, [dd15] movu m6, [ARG_VAR_SHUFFE] pshufb m6, m0 ; class_idx @@ -818,7 +824,9 @@ ALF_CLASSIFY_GRAD %1 cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_sum, width, height, vb_pos, bit_depth, \ x, y, grad, sum_stride, sum_stride3, temp, w +%if ps != 1 sub bit_depthq, 1 +%endif ; now we can use gradient to get class idx and transpose idx lea sum_strideq, [widthd + ALF_GRADIENT_BORDER * 2] -- 2.52.0 >From d699fafe8e761b73ecccdfff585426ce7656490f Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 17:50:52 +0100 Subject: [PATCH 11/28] avcodec/x86/vvc/alf: Remove unused array Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 1 - 1 file changed, 1 deletion(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 572427f98a..a99703f299 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -41,7 +41,6 @@ PARAM_SHUFFE 2 PARAM_SHUFFE 3 
CLASSIFY_SHUFFE: times 2 db 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 -TRANSPOSE_PERMUTE: dd 0, 1, 4, 5, 2, 3, 6, 7 ARG_VAR_SHUFFE: times 2 db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 cextern pd_64 -- 2.52.0 >From 23efc851e01eb37df5bd79bac7fb456eaf2f42ed Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 18:03:47 +0100 Subject: [PATCH 12/28] avcodec/x86/vvc/alf: Don't clip for 8bpp packuswb does it already. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index a99703f299..38fa04a19e 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -409,8 +409,10 @@ SECTION .text ; sum += curr paddsw m0, m2 +%if ps != 1 ; clip to pixel CLIPW m0, m14, m15 +%endif STORE_PIXELS dstq, 0, %1 @@ -443,18 +445,20 @@ SECTION .text %else %xdefine LUMA 0 %endif +%define ps (%1 / 8) ; pixel size ; ****************************** ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, ; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 16, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ +cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ offset, x, s5, s6 -%define ps (%1 / 8) ; pixel size +%if ps != 1 movd xm15, pixel_maxd vpbroadcastw m15, xm15 pxor m14, m14 +%endif .loop: push srcq -- 2.52.0 >From 93114a916cd6d3bbbffbcee60d4574f1d25b19a9 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 18:32:30 +0100 Subject: [PATCH 
13/28] avcodec/x86/vvc/alf: Avoid checking twice Also avoid doing unnecessary work in the width==8 case. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 38fa04a19e..ed83134cd4 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -353,13 +353,12 @@ SECTION .text cmp %3, 8 jl .w4 STORE_PIXELS_W8 %1, %2 - cmp %3, 12 + je .end %if ps == 2 vpermq m%2, m%2, q0302 %else vpermq m%2, m%2, q0101 %endif - jl .end STORE_PIXELS_W4 %1, %2, 8 jmp .end .w4: -- 2.52.0 >From 907e5a9460328cf5f2880b051a14e2708ac653ab Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 19:21:51 +0100 Subject: [PATCH 14/28] avcodec/x86/vvc/alf: Improve storing 8bpp When width is known to be 8 (i.e. for luma that is not width 16), the upper lane is unused, so use an xmm-sized packuswb and avoid the vpermq altogether. For chroma not known to be 16 (i.e. 4,8 or 12) defer extracting from the high lane until it is known to be needed. Also do so via vextracti128 instead of vpermq (also do this for bpp>8). Also use vextracti128 and an xmm-sized packuswb in case of width 16 instead of an ymm-sized packuswb followed by vextracti128. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index ed83134cd4..8798d7b3c9 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -354,11 +354,7 @@ SECTION .text jl .w4 STORE_PIXELS_W8 %1, %2 je .end - %if ps == 2 - vpermq m%2, m%2, q0302 - %else - vpermq m%2, m%2, q0101 - %endif + vextracti128 xm%2, m%2, 1 STORE_PIXELS_W4 %1, %2, 8 jmp .end .w4: @@ -366,19 +362,24 @@ SECTION .text .end: %endmacro -; STORE_PIXELS(dst, src, width) -%macro STORE_PIXELS 3 - %if ps == 1 - packuswb m%2, m%2 - vpermq m%2, m%2, 0x8 - %endif - +; STORE_PIXELS(dst, src, width, tmp reg) +%macro STORE_PIXELS 4 %ifidn %3, 16 + %if ps == 1 + vextracti128 xm%4, m%2, 1 + packuswb xm%2, xm%4 + %endif STORE_PIXELS_W16 %1, %2 %else %if LUMA + %if ps == 1 + packuswb xm%2, xm%2 + %endif STORE_PIXELS_W8 %1, %2 %else + %if ps == 1 + packuswb m%2, m%2 + %endif STORE_PIXELS_W8LE %1, %2, %3 %endif %endif @@ -413,7 +414,7 @@ SECTION .text CLIPW m0, m14, m15 %endif - STORE_PIXELS dstq, 0, %1 + STORE_PIXELS dstq, 0, %1, 2 lea srcq, [srcq + src_strideq] lea dstq, [dstq + dst_strideq] -- 2.52.0 >From bc4762362903664e412054a79f2a83a9cc366d8a Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 21:44:38 +0100 Subject: [PATCH 15/28] avcodec/x86/vvc/alf: Use xmm registers where sufficient One always has eight samples when processing the luma remainder, so xmm registers are sufficient for everything. In fact, this actually simplifies loading the luma parameters. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 8798d7b3c9..9563ae74d5 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -59,15 +59,15 @@ SECTION .text ;%1-%3 out ;%4 clip or filter -%macro LOAD_LUMA_PARAMS_W16 4 +%macro LOAD_LUMA_PARAMS 4 lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE - movu m%1, [%4q + 2 * offsetq + 0 * 32] ; 2 * for sizeof(int16_t) - movu m%2, [%4q + 2 * offsetq + 1 * 32] - movu m%3, [%4q + 2 * offsetq + 2 * 32] + movu m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t) + movu m%2, [%4q + 2 * offsetq + 1 * mmsize] + movu m%3, [%4q + 2 * offsetq + 2 * mmsize] %endmacro %macro LOAD_LUMA_PARAMS_W16 6 - LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4 + LOAD_LUMA_PARAMS %1, %2, %3, %4 ;m%1 = 03 02 01 00 ;m%2 = 07 06 05 04 ;m%3 = 11 10 09 08 @@ -84,11 +84,26 @@ SECTION .text vpermpd m%3, m%3, 10000111b ;11 08 05 02 %endmacro +%macro LOAD_LUMA_PARAMS_W8 5 + LOAD_LUMA_PARAMS %2, %3, %5, %4 + ;m%2 = 01 00 + ;m%3 = 03 02 + ;m%5 = 05 04 + + shufpd m%1, m%2, m%3, 10b ;03 00 + shufpd m%2, m%2, m%5, 01b ;04 01 + shufpd m%3, m%3, m%5, 10b ;05 02 +%endmacro + ; %1-%3 out ; %4 clip or filter ; %5-%6 tmp %macro LOAD_LUMA_PARAMS 6 +%if mmsize == 32 LOAD_LUMA_PARAMS_W16 %1, %2, %3, %4, %5, %6 +%else + LOAD_LUMA_PARAMS_W8 %1, %2, %3, %4, %5 +%endif %endmacro %macro LOAD_CHROMA_PARAMS 4 @@ -483,8 +498,14 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, dst_stride, cmp widthq, 0 je .w_end +%if LUMA +INIT_XMM cpuname +%endif LOAD_PARAMS FILTER_16x4 widthq +%if LUMA +INIT_YMM cpuname +%endif .w_end: -- 2.52.0 >From 00a9a069599e20640d2f8a704d26dc390386f548 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 21:48:06 +0100 Subject: [PATCH 16/28] avcodec/x86/vvc/alf: Don't 
calculate twice Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 9563ae74d5..c8a6565e72 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -60,7 +60,6 @@ SECTION .text ;%1-%3 out ;%4 clip or filter %macro LOAD_LUMA_PARAMS 4 - lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE movu m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t) movu m%2, [%4q + 2 * offsetq + 1 * mmsize] movu m%3, [%4q + 2 * offsetq + 2 * mmsize] @@ -116,6 +115,7 @@ SECTION .text %macro LOAD_PARAMS 0 %if LUMA + lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 %else -- 2.52.0 >From 2fb29c8624a3ba43a764c5c8f6cb18ee29b15477 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 22:19:58 +0100 Subject: [PATCH 17/28] avcodec/x86/vvc/alf: Avoid nonvolatile registers Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index c8a6565e72..c6988b9fcb 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -467,8 +467,20 @@ SECTION .text ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, ; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 14+2*(ps!=1), 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ +cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ offset, 
x, s5, s6 +%if !LUMA +; chroma does not use registers m5 and m8. Swap them to reduce the amount +; of nonvolatile registers on Win64. It also reduces codesize generally +; as encodings with high registers (m8-m15) take more bytes. + %if ps != 1 + SWAP 5,15 + SWAP 8,14 + %else + SWAP 5,12 + SWAP 8,13 + %endif +%endif %if ps != 1 movd xm15, pixel_maxd vpbroadcastw m15, xm15 -- 2.52.0 >From de96386dbd1f0599ae5406206d3309b0e512349b Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sat, 28 Feb 2026 23:44:50 +0100 Subject: [PATCH 18/28] avcodec/x86/vvc/alf: Avoid checking twice Also avoids a vpermq in case width is eight. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index c6988b9fcb..f669375ed9 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -675,10 +675,9 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w cmp wd, 8 jl %%w4 SAVE_CLASSIFY_PARAM_W8 tempq, %2 + je %%end vpermq m%2, m%2, 00010011b add tempq, 8 - cmp wd, 8 - je %%end %%w4: SAVE_CLASSIFY_PARAM_W4 tempq, %2 %%end: -- 2.52.0 >From ac5371bc8f9599288694696087d7b267f87bc5fc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 1 Mar 2026 00:22:44 +0100 Subject: [PATCH 19/28] avcodec/x86/vvc/alf: Improve writing classify parameters The permutation that was applied before the write macro is actually only beneficial when one has 16 entries to write, so move it into the macro to write 16 entries and optimize the other macro. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index f669375ed9..d27e1e0cfc 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -649,23 +649,23 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w ; SAVE_CLASSIFY_PARAM_W16(dest, src) %macro SAVE_CLASSIFY_PARAM_W16 2 + vpermq m%2, m%2, 11011000b lea tempq, [%1q + xq] movu [tempq], xm%2 - vperm2i128 m%2, m%2, m%2, 1 + vextracti128 xm%2, m%2, 1 movu [tempq + widthq], xm%2 %endmacro ; SAVE_CLASSIFY_PARAM_W8 %macro SAVE_CLASSIFY_PARAM_W8 2 movq [%1], xm%2 - vperm2i128 m%2, m%2, m%2, 1 - movq [%1 + widthq], xm%2 + movhps [%1 + widthq], xm%2 %endmacro ; SAVE_CLASSIFY_PARAM_W4 %macro SAVE_CLASSIFY_PARAM_W4 2 movd [%1], xm%2 - vperm2i128 m%2, m%2, m%2, 1 + punpckhqdq xm%2, xm%2 movd [%1 + widthq], xm%2 %endmacro @@ -676,7 +676,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w jl %%w4 SAVE_CLASSIFY_PARAM_W8 tempq, %2 je %%end - vpermq m%2, m%2, 00010011b + vextracti128 xm%2, m%2, 1 add tempq, 8 %%w4: SAVE_CLASSIFY_PARAM_W4 tempq, %2 @@ -775,7 +775,6 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w paddd m11, m7, m7 paddd m11, m4 paddd m10, m11 - vpermq m10, m10, 11011000b SAVE_CLASSIFY_PARAM transpose_idx, 10 psrlq m10, m8, 32 @@ -832,7 +831,6 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w pandn m1, m7 paddd m1, m1 ; dir1 << 1 paddd m6, m1 ; class_idx - vpermq m6, m6, 11011000b SAVE_CLASSIFY_PARAM class_idx, 6 %endmacro -- 2.52.0 >From 0b300fab4a6ecd7320ca304aaaca4df78de7e66c Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 1 Mar 2026 00:46:37 +0100 Subject: [PATCH 20/28] avcodec/x86/vvc/alf: Use memory sources directly Signed-off-by: Andreas Rheinhardt <[email 
protected]> --- libavcodec/x86/vvc/alf.asm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index d27e1e0cfc..cf99f1265c 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -107,9 +107,8 @@ SECTION .text %macro LOAD_CHROMA_PARAMS 4 ; LOAD_CHROMA_PARAMS_W %+ WIDTH %1, %2, %3, %4 - movq xm%1, [%3q] + vpbroadcastq m%1, [%3q] movd xm%2, [%3q + 8] - vpbroadcastq m%1, xm%1 vpbroadcastq m%2, xm%2 %endmacro @@ -602,8 +601,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w pblendw m0, m1, m6, 0x55 paddw m0, m0 ; c - movu m1, [CLASSIFY_SHUFFE] - pshufb m1, m0, m1 ; d + pshufb m1, m0, [CLASSIFY_SHUFFE] ; d paddw m9, m14 ; n + s psubw m9, m0 ; (n + s) - c -- 2.52.0 >From 42b58f29409cd5801b80b1afdd993f79adffa860 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 1 Mar 2026 02:10:17 +0100 Subject: [PATCH 21/28] avcodec/x86/vvc/alf: Don't use 64bit where unnecessary Reduces codesize (avoids REX prefixes). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 64 +++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index cf99f1265c..7393f39f8f 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -187,12 +187,12 @@ SECTION .text neg src_strideq %if LUMA - cmp vb_posq, 0 + cmp vb_posd, 0 je %%vb_bottom - cmp vb_posq, 4 + cmp vb_posd, 4 jne %%vb_end %else - cmp vb_posq, 2 + cmp vb_posd, 2 jne %%vb_end cmp %1, 2 jge %%vb_bottom @@ -206,23 +206,23 @@ SECTION .text ; p4 = (y + i >= vb_pos - 2) ? p2 : p4; ; p5 = (y + i >= vb_pos - 3) ? p3 : p5; ; p6 = (y + i >= vb_pos - 3) ? 
p4 : p6; - dec vb_posq - cmp vb_posq, %1 + dec vb_posd + cmp vb_posd, %1 cmove s1q, srcq cmove s2q, srcq - dec vb_posq - cmp vb_posq, %1 + dec vb_posd + cmp vb_posd, %1 cmovbe s3q, s1q cmovbe s4q, s2q - dec vb_posq + dec vb_posd %if LUMA - cmp vb_posq, %1 + cmp vb_posd, %1 cmovbe s5q, s3q cmovbe s6q, s4q %endif - add vb_posq, 3 + add vb_posd, 3 jmp %%vb_end %%vb_bottom: @@ -233,22 +233,22 @@ SECTION .text ; p4 = (y + i <= vb_pos + 1) ? p2 : p4; ; p5 = (y + i <= vb_pos + 2) ? p3 : p5; ; p6 = (y + i <= vb_pos + 2) ? p4 : p6; - cmp vb_posq, %1 + cmp vb_posd, %1 cmove s1q, srcq cmove s2q, srcq - inc vb_posq - cmp vb_posq, %1 + inc vb_posd + cmp vb_posd, %1 cmovae s3q, s1q cmovae s4q, s2q - inc vb_posq + inc vb_posd %if LUMA - cmp vb_posq, %1 + cmp vb_posd, %1 cmovae s5q, s3q cmovae s6q, s4q %endif - sub vb_posq, 2 + sub vb_posd, 2 %%vb_end: %endmacro @@ -266,18 +266,18 @@ SECTION .text je %%near_below jmp %%no_vb %%near_above: - cmp vb_posq, 4 + cmp vb_posd, 4 je %%near_vb jmp %%no_vb %%near_below: - cmp vb_posq, 0 + cmp vb_posd, 0 je %%near_vb %else cmp %1, 0 je %%no_vb cmp %1, 3 je %%no_vb - cmp vb_posq, 2 + cmp vb_posd, 2 je %%near_vb %endif %%no_vb: @@ -414,11 +414,11 @@ SECTION .text %define s4q offsetq push xq - xor xq, xq + xor xd, xd %%filter_16x4_loop: LOAD_PIXELS m2, [srcq] ;p0 - FILTER_VB xq + FILTER_VB xd ; sum += curr paddsw m0, m2 @@ -432,8 +432,8 @@ SECTION .text lea srcq, [srcq + src_strideq] lea dstq, [dstq + dst_strideq] - inc xq - cmp xq, 4 + inc xd + cmp xd, 4 jl %%filter_16x4_loop mov xq, src_strideq @@ -490,10 +490,10 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s push srcq push dstq push widthq - xor xq, xq + xor xd, xd .loop_w: - cmp widthq, 16 + cmp widthd, 16 jl .loop_w_end LOAD_PARAMS @@ -501,19 +501,19 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s add srcq, 16 * ps add dstq, 16 * ps - add xq, 16 - sub widthq, 16 + add xd, 16 + sub widthd, 16 jmp .loop_w .loop_w_end: - 
cmp widthq, 0 + cmp widthd, 0 je .w_end %if LUMA INIT_XMM cpuname %endif LOAD_PARAMS - FILTER_16x4 widthq + FILTER_16x4 widthd %if LUMA INIT_YMM cpuname %endif @@ -529,8 +529,8 @@ INIT_YMM cpuname lea filterq, [filterq + 2 * strideq] lea clipq, [clipq + 2 * strideq] - sub vb_posq, 4 - sub heightq, 4 + sub vb_posd, 4 + sub heightd, 4 jg .loop RET %endmacro @@ -856,7 +856,7 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_su x, y, grad, sum_stride, sum_stride3, temp, w %if ps != 1 - sub bit_depthq, 1 + sub bit_depthd, 1 %endif ; now we can use gradient to get class idx and transpose idx -- 2.52.0 >From 77beb53d81955d8a1920f3a1eecfbb9f5b7ffcbf Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 1 Mar 2026 03:22:28 +0100 Subject: [PATCH 22/28] avcodec/x86/vvc/alf: Avoid broadcast Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 7393f39f8f..cd4de6185d 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -688,13 +688,12 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w movu m1, [gradq + sum_strideq] movu m2, [gradq + 2 * sum_strideq] - pcmpeqb m11, m11 movd xm13, yd - vpbroadcastd m13, xm13 movd xm12, vb_posd - vpbroadcastd m12, xm12 - pcmpeqd m13, m12 ; y == vb_pos - pandn m13, m11 ; y != vb_pos + pcmpeqb xm11, xm11 + pcmpeqd xm13, xm12 ; y == vb_pos + pxor xm13, xm11 ; y != vb_pos + vpbroadcastd m13, xm13 vpbroadcastd m14, [dw3] pblendvb m14, m14, [dd2], m13 ; ac -- 2.52.0 >From ebfdc406116edc2271a7e50528123e04bfba71eb Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Sun, 1 Mar 2026 03:45:47 +0100 Subject: [PATCH 23/28] avcodec/x86/vvc/alf: Improve deriving ac Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 3 +-- 1 
file changed, 1 insertion(+), 2 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index cd4de6185d..e924308cff 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -45,7 +45,6 @@ ARG_VAR_SHUFFE: times 2 db 0, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4 cextern pd_64 dd448: times 8 dd 512 - 64 -dd2: times 8 dd 2 dw3: times 8 dd 3 dw5: times 8 dd 5 dd15: times 8 dd 15 @@ -696,7 +695,7 @@ cglobal vvc_alf_classify_grad_%1bpc, 6, 14, 16, gradient_sum, src, src_stride, w vpbroadcastd m13, xm13 vpbroadcastd m14, [dw3] - pblendvb m14, m14, [dd2], m13 ; ac + paddd m14, m13 ; ac = (y != vb_pos) ? 2 : 3 pblendvb m3, m15, [gradq + sum_stride3q], m13 -- 2.52.0 >From 929f1c5db9cc635374afd2ee475ba711291f203d Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 2 Mar 2026 17:20:31 +0100 Subject: [PATCH 24/28] avcodec/x86/vvc/alf: Remove pointless counter, stride Each luma alf block has 2*12 auxiliary coefficients associated with it that the alf_filter functions consume; the C version simply increments the pointers. The x64 dsp function meanwhile does things differently: The vvc_alf_filter functions have three levels of loops. The middle layer uses two counters, one of which is just the horizontal offset xd in the current line. It is only used for addressing these auxiliary coefficients and yet one needs to perform work to translate from it to the coefficient offset, namely a *3 via lea and a *2 scale. Furthermore, the base pointers of the coefficients are incremented in the outer loop; the stride used for this is calculated in the C wrapper functions. Furthermore, due to GPR pressure xd is reused as loop counter for the innermost loop; the xd from the middle loop is pushed to the stack. Apart from the translation from horizontal offset to coefficient offset all of the above has been done for chroma, too, although the coefficient pointers don't get modified for them at all. 
This commit changes this to just increment the pointers after reading the relevant coefficients. Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 36 +++++++++++------------------------ libavcodec/x86/vvc/dsp_init.c | 9 ++++----- 2 files changed, 15 insertions(+), 30 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index e924308cff..df2f782683 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -59,9 +59,12 @@ SECTION .text ;%1-%3 out ;%4 clip or filter %macro LOAD_LUMA_PARAMS 4 - movu m%1, [%4q + 2 * offsetq + 0 * mmsize] ; 2 * for sizeof(int16_t) - movu m%2, [%4q + 2 * offsetq + 1 * mmsize] - movu m%3, [%4q + 2 * offsetq + 2 * mmsize] + movu m%1, [%4q + 0 * mmsize] + movu m%2, [%4q + 1 * mmsize] + movu m%3, [%4q + 2 * mmsize] + ; we process mmsize/(2*ALF_BLOCK_SIZE) alf blocks, + ; consuming ALF_NUM_COEFF_LUMA int16_t coeffs per alf block + add %4q, 3 * mmsize %endmacro %macro LOAD_LUMA_PARAMS_W16 6 @@ -113,7 +116,6 @@ SECTION .text %macro LOAD_PARAMS 0 %if LUMA - lea offsetq, [3 * xq] ;xq * ALF_NUM_COEFF_LUMA / ALF_BLOCK_SIZE LOAD_LUMA_PARAMS 3, 4, 5, filter, 6, 7 LOAD_LUMA_PARAMS 6, 7, 8, clip, 9, 10 %else @@ -401,18 +403,10 @@ SECTION .text %macro FILTER_16x4 1 %if LUMA push clipq - push strideq - %define s1q clipq - %define s2q strideq -%else - %define s1q s5q - %define s2q s6q + %define s5q clipq + %define s6q pixel_maxq %endif - %define s3q pixel_maxq - %define s4q offsetq - push xq - xor xd, xd %%filter_16x4_loop: LOAD_PIXELS m2, [srcq] ;p0 @@ -442,10 +436,7 @@ SECTION .text neg xq lea dstq, [dstq + xq * 4] - pop xq - %if LUMA - pop strideq pop clipq %endif %endmacro @@ -463,10 +454,10 @@ SECTION .text ; ****************************** ; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, -; const int16_t *filter, const int16_t *clip, ptrdiff_t stride, 
ptrdiff_t vb_pos, ptrdiff_t pixel_max); +; const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, stride, vb_pos, pixel_max, \ - offset, x, s5, s6 +cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ + x, s1, s2, s3, s4 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount ; of nonvolatile registers on Win64. It also reduces codesize generally @@ -489,7 +480,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s push srcq push dstq push widthq - xor xd, xd .loop_w: cmp widthd, 16 @@ -500,7 +490,6 @@ cglobal vvc_alf_filter_%2_%1bpc, 11, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_s add srcq, 16 * ps add dstq, 16 * ps - add xd, 16 sub widthd, 16 jmp .loop_w @@ -525,9 +514,6 @@ INIT_YMM cpuname lea srcq, [srcq + 4 * src_strideq] lea dstq, [dstq + 4 * dst_strideq] - lea filterq, [filterq + 2 * strideq] - lea clipq, [clipq + 2 * strideq] - sub vb_posd, 4 sub heightd, 4 jg .loop diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 158308fb33..5194ecfdeb 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -60,10 +60,10 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, #define ALF_BPC_PROTOTYPES(bpc, opt) \ void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ 
const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ + const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \ const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \ void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \ @@ -153,15 +153,14 @@ FW_PUT_16BPC_AVX2(12) static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ - const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \ BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, param_stride, vb_pos, (1 << bd) - 1); \ + filter, clip, vb_pos, (1 << bd) - 1); \ } \ static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ { \ BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, 0, vb_pos,(1 << bd) - 1); \ + filter, clip, vb_pos,(1 << bd) - 1); \ } \ static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \ const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \ -- 2.52.0 >From 0a579c6cc51634f93b8fb0661f5dfc229164e4e2 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 2 Mar 2026 20:39:02 +0100 Subject: [PATCH 25/28] avcodec/x86/vvc/alf: Don't modify rsp unnecessarily The vvc_alf_filter functions don't use x86inc's stack management feature at all; they merely push and pop some regs themselves. 
So don't tell x86inc to provide stack (which in this case entails aligning the stack). Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index df2f782683..429adec861 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -456,7 +456,7 @@ SECTION .text ; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, ; const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, 0-0x30, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ +cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ x, s1, s2, s3, s4 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount -- 2.52.0 >From c6ee6a8257af0b029bc48ec56251c6762145ddf5 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Mon, 2 Mar 2026 21:38:29 +0100 Subject: [PATCH 26/28] avcodec/x86/vvc/alf: Improve offsetting pointers It can be combined with an earlier lea for the loop processing 16 pixels at a time; it is unnecessary for the tail, because the new values will be overwritten immediately afterwards anyway. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index 429adec861..b7e9c54b68 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -400,7 +400,7 @@ SECTION .text %endif %endmacro -%macro FILTER_16x4 1 +%macro FILTER_16x4 2 %if LUMA push clipq %define s5q clipq @@ -429,12 +429,14 @@ SECTION .text cmp xd, 4 jl %%filter_16x4_loop +%ifnidn %2, 0 mov xq, src_strideq neg xq - lea srcq, [srcq + xq * 4] + lea srcq, [srcq + xq * 4 + %2] mov xq, dst_strideq neg xq - lea dstq, [dstq + xq * 4] + lea dstq, [dstq + xq * 4 + %2] +%endif %if LUMA pop clipq @@ -486,10 +488,8 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s jl .loop_w_end LOAD_PARAMS - FILTER_16x4 16 + FILTER_16x4 16, 16 * ps - add srcq, 16 * ps - add dstq, 16 * ps sub widthd, 16 jmp .loop_w @@ -501,7 +501,7 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s INIT_XMM cpuname %endif LOAD_PARAMS - FILTER_16x4 widthd + FILTER_16x4 widthd, 0 %if LUMA INIT_YMM cpuname %endif -- 2.52.0 >From aabecbe0f40994d270925a4573eb4529887eb2b1 Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 3 Mar 2026 01:09:26 +0100 Subject: [PATCH 27/28] avcodec/x86/vvc/alf: Avoid pointless wrappers for alf_filter They are completely unnecessary for the 8bit case (which only handles 8bit) and overly complicated for the 10 and 12bit cases: All one needs to do is set up the (1<<bpp)-1 vector register and jmp from (say) the 12bpp function stub inside the 10bpp function. The way it is done here even allows to share the prologue between the two functions. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 53 ++++++++++++++++++++++------------- libavcodec/x86/vvc/dsp_init.c | 38 +++++++++---------------- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index b7e9c54b68..dd3652843e 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -403,8 +403,7 @@ SECTION .text %macro FILTER_16x4 2 %if LUMA push clipq - %define s5q clipq - %define s6q pixel_maxq + %define s6q clipq %endif xor xd, xd @@ -443,23 +442,21 @@ SECTION .text %endif %endmacro -; FILTER(bpc, luma/chroma) -%macro ALF_FILTER 2 -%xdefine BPC %1 +; FILTER(bd, luma/chroma, bd of implementation to use) +%macro ALF_FILTER 3 %ifidn %2, luma %xdefine LUMA 1 %else %xdefine LUMA 0 %endif -%define ps (%1 / 8) ; pixel size +%assign ps (%1+7) / 8 ; pixel size ; ****************************** -; void vvc_alf_filter_%2_%1bpc_avx2(uint8_t *dst, ptrdiff_t dst_stride, -; const uint8_t *src, ptrdiff_t src_stride, const ptrdiff_t width, cosnt ptr_diff_t height, -; const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); +; void ff_vvc_alf_filter_%2_%1_avx2(uint8_t *dst, ptrdiff_t dst_stride, +; const uint8_t *src, ptrdiff_t src_stride, int width, int height, +; const int16_t *filter, const int16_t *clip, int vb_pos); ; ****************************** -cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, pixel_max, \ - x, s1, s2, s3, s4 +cglobal vvc_alf_filter_%2_%1 %if !LUMA ; chroma does not use registers m5 and m8. Swap them to reduce the amount ; of nonvolatile registers on Win64. 
It also reduces codesize generally @@ -471,10 +468,24 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s SWAP 5,12 SWAP 8,13 %endif +%elif WIN64 && (ps != 1) +; Swap m5 and m15, so that the register for the maximum pixel value +; ends up in a volatile register + SWAP 5,15 %endif %if ps != 1 - movd xm15, pixel_maxd - vpbroadcastw m15, xm15 + ; create pw_pixelmax for clipping + pcmpeqw m15, m15 + psrlw m15, 16 - %1 +%endif + +%if %1 != %3 + jmp vvc_alf_filter_%2_%3_prologue +%else +vvc_alf_filter_%2_%1_prologue: + PROLOGUE 9, 14+LUMA, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, src, src_stride, width, height, filter, clip, vb_pos, \ + x, s1, s2, s3, s4, s5 +%if ps != 1 pxor m14, m14 %endif @@ -498,7 +509,9 @@ cglobal vvc_alf_filter_%2_%1bpc, 10, 15, 12+2*(ps!=1)+2*LUMA, dst, dst_stride, s je .w_end %if LUMA +SAVE_MM_PERMUTATION INIT_XMM cpuname +LOAD_MM_PERMUTATION %endif LOAD_PARAMS FILTER_16x4 widthd, 0 @@ -518,12 +531,13 @@ INIT_YMM cpuname sub heightd, 4 jg .loop RET +%endif %endmacro -; FILTER(bpc) -%macro ALF_FILTER 1 - ALF_FILTER %1, luma - ALF_FILTER %1, chroma +; FILTER(bd, bd of implementation to use) +%macro ALF_FILTER 2 + ALF_FILTER %1, luma, %2 + ALF_FILTER %1, chroma, %2 %endmacro %define ALF_GRADIENT_BORDER 2 @@ -891,9 +905,10 @@ cglobal vvc_alf_classify_%1bpc, 7, 15, 16, class_idx, transpose_idx, gradient_su %if ARCH_X86_64 %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 -ALF_FILTER 16 -ALF_FILTER 8 +ALF_FILTER 12, 10 +ALF_FILTER 10, 10 ALF_CLASSIFY 16 +ALF_FILTER 8, 8 ALF_CLASSIFY 8 %endif %endif diff --git a/libavcodec/x86/vvc/dsp_init.c b/libavcodec/x86/vvc/dsp_init.c index 5194ecfdeb..6802294795 100644 --- a/libavcodec/x86/vvc/dsp_init.c +++ b/libavcodec/x86/vvc/dsp_init.c @@ -58,12 +58,6 @@ void ff_vvc_apply_bdof_## BD ## _ ## OPT(uint8_t *dst, ptrdiff_t dst_stride, } while (0) #define ALF_BPC_PROTOTYPES(bpc, opt) \ -void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const uint8_t *src, ptrdiff_t 
src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ -void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ - const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \ - const int16_t *filter, const int16_t *clip, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \ void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \ const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \ void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \ @@ -150,18 +144,6 @@ FW_PUT_16BPC_AVX2(10) FW_PUT_16BPC_AVX2(12) #define ALF_FUNCS(bpc, bd, opt) \ -static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ - int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ -{ \ - BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, vb_pos, (1 << bd) - 1); \ -} \ -static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \ - int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \ -{ \ - BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \ - filter, clip, vb_pos,(1 << bd) - 1); \ -} \ static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \ const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \ { \ @@ -298,10 +280,16 @@ void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h); #define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2 -#define ALF_INIT(bd) do { \ - c->alf.filter[LUMA] = vvc_alf_filter_luma_##bd##_avx2; \ - 
c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2; \ - c->alf.classify = vvc_alf_classify_##bd##_avx2; \ +#define ALF_INIT(bd, opt) do { \ +void bf(ff_vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, int width, int height, \ + const int16_t *filter, const int16_t *clip, int vb_pos); \ +void bf(ff_vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \ + const uint8_t *src, ptrdiff_t src_stride, int width, int height, \ + const int16_t *filter, const int16_t *clip, int vb_pos); \ + c->alf.filter[LUMA] = bf(ff_vvc_alf_filter_luma, bd, opt); \ + c->alf.filter[CHROMA] = bf(ff_vvc_alf_filter_chroma, bd, opt); \ + c->alf.classify = bf(vvc_alf_classify, bd, opt); \ } while (0) #endif @@ -331,7 +319,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(8); + ALF_INIT(8, avx2); SAO_INIT(8, avx2); } #endif @@ -353,7 +341,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(10); + ALF_INIT(10, avx2); SAO_INIT(10, avx2); } #endif @@ -375,7 +363,7 @@ av_cold void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd) SAD_INIT(); // filter - ALF_INIT(12); + ALF_INIT(12, avx2); SAO_INIT(12, avx2); } #endif -- 2.52.0 >From 01a3573ec973426ea2375c2939a422ebcf9be0bc Mon Sep 17 00:00:00 2001 From: Andreas Rheinhardt <[email protected]> Date: Tue, 3 Mar 2026 02:42:58 +0100 Subject: [PATCH 28/28] avcodec/x86/vvc/alf: Simplify vb_pos comparisons The value of vb_pos at vb_bottom, vb_above is known at compile-time, so one can avoid the modifications to vb_pos and just compare against immediates. 
Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vvc/alf.asm | 67 +++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/libavcodec/x86/vvc/alf.asm b/libavcodec/x86/vvc/alf.asm index dd3652843e..dfa9de2a97 100644 --- a/libavcodec/x86/vvc/alf.asm +++ b/libavcodec/x86/vvc/alf.asm @@ -192,64 +192,65 @@ SECTION .text je %%vb_bottom cmp vb_posd, 4 jne %%vb_end -%else - cmp vb_posd, 2 - jne %%vb_end - cmp %1, 2 - jge %%vb_bottom -%endif - %%vb_above: - ; above + ; above: vb_pos == 4 ; p1 = (y + i == vb_pos - 1) ? p0 : p1; ; p2 = (y + i == vb_pos - 1) ? p0 : p2; ; p3 = (y + i >= vb_pos - 2) ? p1 : p3; ; p4 = (y + i >= vb_pos - 2) ? p2 : p4; ; p5 = (y + i >= vb_pos - 3) ? p3 : p5; ; p6 = (y + i >= vb_pos - 3) ? p4 : p6; - dec vb_posd - cmp vb_posd, %1 + cmp %1, 3 cmove s1q, srcq cmove s2q, srcq - dec vb_posd - cmp vb_posd, %1 - cmovbe s3q, s1q - cmovbe s4q, s2q + cmp %1, 1 + cmova s3q, s1q + cmova s4q, s2q - dec vb_posd -%if LUMA - cmp vb_posd, %1 - cmovbe s5q, s3q - cmovbe s6q, s4q -%endif - add vb_posd, 3 + cmovae s5q, s3q + cmovae s6q, s4q jmp %%vb_end %%vb_bottom: - ; bottom + ; bottom: vb_pos == 0 ; p1 = (y + i == vb_pos ) ? p0 : p1; ; p2 = (y + i == vb_pos ) ? p0 : p2; ; p3 = (y + i <= vb_pos + 1) ? p1 : p3; ; p4 = (y + i <= vb_pos + 1) ? p2 : p4; ; p5 = (y + i <= vb_pos + 2) ? p3 : p5; ; p6 = (y + i <= vb_pos + 2) ? 
p4 : p6; - cmp vb_posd, %1 + cmp %1, 0 cmove s1q, srcq cmove s2q, srcq - inc vb_posd - cmp vb_posd, %1 - cmovae s3q, s1q - cmovae s4q, s2q + cmp %1, 2 + cmovb s3q, s1q + cmovb s4q, s2q - inc vb_posd -%if LUMA - cmp vb_posd, %1 - cmovae s5q, s3q - cmovae s6q, s4q + cmovbe s5q, s3q + cmovbe s6q, s4q +%else ; chroma + cmp vb_posd, 2 + jne %%vb_end + cmp %1, 2 + jge %%vb_bottom +%%vb_above: + cmp %1, 1 + cmove s1q, srcq + cmove s2q, srcq + + mov s3q, s1q + mov s4q, s2q + jmp %%vb_end + +%%vb_bottom: + cmove s1q, srcq + cmove s2q, srcq + + mov s3q, s1q + mov s4q, s2q %endif - sub vb_posd, 2 %%vb_end: %endmacro -- 2.52.0 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
