Hi Zhili, good job — I appreciate it. With this patch, we're very close to smooth 4K@30 playback on my M2.
On Tue, Jul 16, 2024 at 12:19 AM Zhao Zhili <quinkbl...@foxmail.com> wrote: > From: Zhao Zhili <zhiliz...@tencent.com> > > vvc_alf_filter_chroma_4x4_8_c: 3.0 > vvc_alf_filter_chroma_4x4_8_neon: 1.0 > vvc_alf_filter_chroma_4x4_10_c: 2.7 > vvc_alf_filter_chroma_4x4_10_neon: 1.0 > vvc_alf_filter_chroma_4x4_12_c: 2.7 > vvc_alf_filter_chroma_4x4_12_neon: 1.0 > vvc_alf_filter_chroma_8x8_8_c: 10.2 > vvc_alf_filter_chroma_8x8_8_neon: 3.0 > vvc_alf_filter_chroma_8x8_10_c: 10.0 > vvc_alf_filter_chroma_8x8_10_neon: 2.5 > vvc_alf_filter_chroma_8x8_12_c: 10.0 > vvc_alf_filter_chroma_8x8_12_neon: 2.5 > vvc_alf_filter_chroma_16x16_8_c: 41.7 > vvc_alf_filter_chroma_16x16_8_neon: 11.2 > vvc_alf_filter_chroma_16x16_10_c: 39.0 > vvc_alf_filter_chroma_16x16_10_neon: 10.0 > vvc_alf_filter_chroma_16x16_12_c: 40.2 > vvc_alf_filter_chroma_16x16_12_neon: 10.2 > vvc_alf_filter_chroma_32x32_8_c: 162.0 > vvc_alf_filter_chroma_32x32_8_neon: 45.0 > vvc_alf_filter_chroma_32x32_10_c: 155.5 > vvc_alf_filter_chroma_32x32_10_neon: 39.5 > vvc_alf_filter_chroma_32x32_12_c: 155.5 > vvc_alf_filter_chroma_32x32_12_neon: 40.0 > vvc_alf_filter_chroma_64x64_8_c: 646.0 > vvc_alf_filter_chroma_64x64_8_neon: 175.5 > vvc_alf_filter_chroma_64x64_10_c: 708.2 > vvc_alf_filter_chroma_64x64_10_neon: 166.7 > vvc_alf_filter_chroma_64x64_12_c: 619.2 > vvc_alf_filter_chroma_64x64_12_neon: 157.2 > vvc_alf_filter_chroma_128x128_8_c: 2611.5 > vvc_alf_filter_chroma_128x128_8_neon: 698.2 > vvc_alf_filter_chroma_128x128_10_c: 2470.0 > vvc_alf_filter_chroma_128x128_10_neon: 616.0 > vvc_alf_filter_chroma_128x128_12_c: 2531.5 > vvc_alf_filter_chroma_128x128_12_neon: 620.2 > vvc_alf_filter_luma_8x8_8_c: 25.2 > vvc_alf_filter_luma_8x8_8_neon: 4.2 > vvc_alf_filter_luma_8x8_10_c: 18.5 > vvc_alf_filter_luma_8x8_10_neon: 4.0 > vvc_alf_filter_luma_8x8_12_c: 19.0 > vvc_alf_filter_luma_8x8_12_neon: 4.0 > vvc_alf_filter_luma_16x16_8_c: 106.5 > vvc_alf_filter_luma_16x16_8_neon: 16.2 > vvc_alf_filter_luma_16x16_10_c: 75.2 > 
vvc_alf_filter_luma_16x16_10_neon: 14.7 > vvc_alf_filter_luma_16x16_12_c: 79.7 > vvc_alf_filter_luma_16x16_12_neon: 14.7 > vvc_alf_filter_luma_32x32_8_c: 400.5 > vvc_alf_filter_luma_32x32_8_neon: 63.2 > vvc_alf_filter_luma_32x32_10_c: 299.2 > vvc_alf_filter_luma_32x32_10_neon: 57.7 > vvc_alf_filter_luma_32x32_12_c: 299.2 > vvc_alf_filter_luma_32x32_12_neon: 57.7 > vvc_alf_filter_luma_64x64_8_c: 1602.5 > vvc_alf_filter_luma_64x64_8_neon: 251.7 > vvc_alf_filter_luma_64x64_10_c: 1197.0 > vvc_alf_filter_luma_64x64_10_neon: 235.5 > vvc_alf_filter_luma_64x64_12_c: 1220.2 > vvc_alf_filter_luma_64x64_12_neon: 235.7 > vvc_alf_filter_luma_128x128_8_c: 6570.2 > vvc_alf_filter_luma_128x128_8_neon: 1007.7 > vvc_alf_filter_luma_128x128_10_c: 4822.7 > vvc_alf_filter_luma_128x128_10_neon: 936.2 > vvc_alf_filter_luma_128x128_12_c: 4791.2 > vvc_alf_filter_luma_128x128_12_neon: 938.5 > > Signed-off-by: Zhao Zhili <zhiliz...@tencent.com> > --- > libavcodec/aarch64/vvc/Makefile | 5 + > libavcodec/aarch64/vvc/alf.S | 293 ++++++++++++++++++++++++++ > libavcodec/aarch64/vvc/alf_template.c | 157 ++++++++++++++ > libavcodec/aarch64/vvc/dsp_init.c | 57 +++++ > libavcodec/vvc/dsp.c | 4 +- > libavcodec/vvc/dsp.h | 1 + > 6 files changed, 516 insertions(+), 1 deletion(-) > create mode 100644 libavcodec/aarch64/vvc/Makefile > create mode 100644 libavcodec/aarch64/vvc/alf.S > create mode 100644 libavcodec/aarch64/vvc/alf_template.c > create mode 100644 libavcodec/aarch64/vvc/dsp_init.c > > diff --git a/libavcodec/aarch64/vvc/Makefile > b/libavcodec/aarch64/vvc/Makefile > new file mode 100644 > index 0000000000..58398d6e3d > --- /dev/null > +++ b/libavcodec/aarch64/vvc/Makefile > @@ -0,0 +1,5 @@ > +clean:: > + $(RM) $(CLEANSUFFIXES:%=libavcodec/aarch64/vvc/%) > + > +OBJS-$(CONFIG_VVC_DECODER) += > aarch64/vvc/dsp_init.o > +NEON-OBJS-$(CONFIG_VVC_DECODER) += > aarch64/vvc/alf.o > diff --git a/libavcodec/aarch64/vvc/alf.S b/libavcodec/aarch64/vvc/alf.S > new file mode 100644 > index 
0000000000..beb36ac66b > --- /dev/null > +++ b/libavcodec/aarch64/vvc/alf.S > @@ -0,0 +1,293 @@ > +/* > + * Copyright (c) 2024 Zhao Zhili <quinkbl...@foxmail.com> > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/aarch64/asm.S" > + > +.macro alf_luma_filter_pixel index, pix_size, addr1, addr2, offset1, > offset2 > + .if \pix_size == 1 > + ldur d3, [\addr1, #\offset1] > + ldur d4, [\addr2, #\offset2] > + uxtl v6.8h, v3.8b > + uxtl v7.8h, v4.8b > + .else > + ldur q6, [\addr1, #(2*\offset1)] > + ldur q7, [\addr2, #(2*\offset2)] > + .endif > + .if \index < 8 > + dup v17.4h, v0.h[\index] // clip > + dup v18.4h, v16.h[\index] // -clip > + dup v19.4h, v1.h[\index] // filter > + > + dup v26.4h, v22.h[\index] // clip > + dup v27.4h, v23.h[\index] // -clip > + dup v28.4h, v24.h[\index] // filter > + .else > + dup v17.4h, v0.h[\index - 8] // clip > + dup v18.4h, v16.h[\index - 8] // -clip > + dup v19.4h, v1.h[\index - 8] // filter > + > + dup v26.4h, v22.h[\index - 8] // clip > + dup v27.4h, v23.h[\index - 8] // -clip > + dup v28.4h, v24.h[\index - 8] // filter > + .endif > + ins v17.d[1], v26.d[0] > + ins v18.d[1], v27.d[0] > + ins v19.d[1], v28.d[0] > + > + sub v6.8h, v6.8h, v5.8h > + sub v7.8h, 
v7.8h, v5.8h > + smin v6.8h, v6.8h, v17.8h > + smin v7.8h, v7.8h, v17.8h > + smax v6.8h, v6.8h, v18.8h > + smax v7.8h, v7.8h, v18.8h > + add v6.8h, v6.8h, v7.8h > + smlal v20.4s, v19.4h, v6.4h // v20: sum > + smlal2 v21.4s, v19.8h, v6.8h // v21: sum > +.endm > + > +/* x0: dst > + * x1: pp > + * x2: filter > + * x3: clip > + * w4: is_near_vb > + * w5: pix_max > + */ > +.macro alf_filter_luma_kernel, pix_size > + dst .req x0 > + pp .req x1 > + filter .req x2 > + clip .req x3 > + is_near_vb .req w4 > + pix_max .req w5 > + .if \pix_size > 1 > + dup v25.8h, pix_max // pix_max > + .endif > + ldr q0, [clip] // clip > + ldr q1, [filter] // filter > + ldr q22, [clip, #24] // clip > + ldr q24, [filter, #24] // filter > + > + ldr x5, [pp] // x5: p0 > + ldr x6, [pp, #(5*8)] // x6: p5 > + ldr x7, [pp, #(6*8)] // x7: p6 > + neg v16.8h, v0.8h // -clip > + neg v23.8h, v22.8h // -clip > + > + .if \pix_size == 1 > + ldr d2, [x5] // curr > + .else > + ldr q5, [x5] // curr > + .endif > + movi v20.4s, 64 > + cbz is_near_vb, 1f > + shl v20.4s, v20.4s, #3 > +1: > + .if \pix_size == 1 > + uxtl v5.8h, v2.8b > + .endif > + mov v21.16b, v20.16b > + ldr x8, [pp, #(3*8)] // p3 > + ldr x9, [pp, #(4*8)] // p4 > + alf_luma_filter_pixel 0, \pix_size, x6, x7, 0, 0 > + > + ldr x6, [pp, #(1*8)] // p1 > + ldr x7, [pp, #(2*8)] // p2 > + alf_luma_filter_pixel 1, \pix_size, x8, x9, 1, -1 > + alf_luma_filter_pixel 2, \pix_size, x8, x9, 0, 0 > + alf_luma_filter_pixel 3, \pix_size, x8, x9, -1, 1 > + > + alf_luma_filter_pixel 4, \pix_size, x6, x7, 2, -2 > + alf_luma_filter_pixel 5, \pix_size, x6, x7, 1, -1 > + alf_luma_filter_pixel 6, \pix_size, x6, x7, 0, 0 > + alf_luma_filter_pixel 7, \pix_size, x6, x7, -1, 1 > + > + ldr d0, [clip, #16] // clip > + ldr d1, [filter, #16] // filter > + neg v16.4h, v0.4h // -clip > + > + ldr d22, [clip, #40] // clip > + ldr d24, [filter, #40] // filter > + neg v23.4h, v22.4h // -clip > + alf_luma_filter_pixel 8, \pix_size, x6, x7, -2, 2 > + alf_luma_filter_pixel 9, \pix_size, 
x5, x5, 3, -3 > + alf_luma_filter_pixel 10, \pix_size, x5, x5, 2, -2 > + alf_luma_filter_pixel 11, \pix_size, x5, x5, 1, -1 > + > + cbz is_near_vb, 2f > + sshr v20.4s, v20.4s, #10 > + sshr v21.4s, v21.4s, #10 > + b 3f > +2: > + sshr v20.4s, v20.4s, #7 > + sshr v21.4s, v21.4s, #7 > +3: > + uxtl v22.4s, v5.4h > + uxtl2 v23.4s, v5.8h > + add v20.4s, v20.4s, v22.4s > + add v21.4s, v21.4s, v23.4s > + sqxtun v20.4h, v20.4s > + sqxtun2 v20.8h, v21.4s > + .if \pix_size == 1 > + sqxtun v20.8b, v20.8h > + str d20, [dst] > + .else > + smin v20.8h, v20.8h, v25.8h > + str q20, [dst] > + .endif > + ret > + > + .unreq dst > + .unreq pp > + .unreq filter > + .unreq clip > + .unreq is_near_vb > + .unreq pix_max > +.endm > + > +.macro alf_chroma_filter_pixel index, pix_size, addr1, addr2, offset1, > offset2 > + .if \pix_size == 1 > + ldur s3, [\addr1, #\offset1] > + ldur s4, [\addr2, #\offset2] > + uxtl v6.8h, v3.8b > + uxtl v7.8h, v4.8b > + .else > + ldur d6, [\addr1, #(2*\offset1)] > + ldur d7, [\addr2, #(2*\offset2)] > + .endif > + .if \index < 8 > + dup v17.4h, v0.h[\index] // v17: clip[0] > + dup v18.4h, v16.h[\index] // v18: -clip[0] > + dup v19.4h, v1.h[\index] // v19: filter[0] > + .else > + dup v17.4h, v0.h[\index - 8] // v17: clip[0] > + dup v18.4h, v16.h[\index - 8] // v18: -clip[0] > + dup v19.4h, v1.h[\index - 8] // v19: filter[0] > + .endif > + > + sub v6.4h, v6.4h, v5.4h > + sub v7.4h, v7.4h, v5.4h > + smin v6.4h, v6.4h, v17.4h > + smin v7.4h, v7.4h, v17.4h > + smax v6.4h, v6.4h, v18.4h > + smax v7.4h, v7.4h, v18.4h > + add v6.4h, v6.4h, v7.4h > + smlal v20.4s, v19.4h, v6.4h // v20: sum > +.endm > + > +/* x0: dst > + * x1: pp > + * x2: filter > + * x3: clip > + * w4: is_near_vb > + * w5: pix_max > + */ > +.macro alf_filter_chroma_kernel, pix_size > + dst .req x0 > + pp .req x1 > + filter .req x2 > + clip .req x3 > + is_near_vb .req w4 > + pix_max .req w5 > + .if \pix_size > 1 > + dup v25.4h, pix_max // pix_max > + .endif > + ldr q0, [clip] // clip > + ldr q1, [filter] 
// filter > + ldr x5, [pp] // p0 > + ldr x6, [pp, #(3*8)] // p3 > + ldr x7, [pp, #(4*8)] // p4 > + neg v16.8h, v0.8h // -clip > + > + .if \pix_size == 1 > + ldr s2, [x5] // curr > + .else > + ldr d5, [x5] // curr > + .endif > + movi v20.4s, 64 > + cbz is_near_vb, 1f > + shl v20.4s, v20.4s, #3 > +1: > + .if \pix_size == 1 > + uxtl v5.8h, v2.8b > + .endif > + ldr x8, [pp, #(1*8)] // p1 > + ldr x9, [pp, #(2*8)] // p2 > + alf_chroma_filter_pixel 0, \pix_size, x6, x7, 0, 0 > + alf_chroma_filter_pixel 1, \pix_size, x8, x9, 1, -1 > + alf_chroma_filter_pixel 2, \pix_size, x8, x9, 0, 0 > + alf_chroma_filter_pixel 3, \pix_size, x8, x9, -1, 1 > + alf_chroma_filter_pixel 4, \pix_size, x5, x5, 2, -2 > + alf_chroma_filter_pixel 5, \pix_size, x5, x5, 1, -1 > + > + uxtl v22.4s, v5.4h > + cbz is_near_vb, 2f > + sshr v20.4s, v20.4s, #10 > + b 3f > +2: > + sshr v20.4s, v20.4s, #7 > +3: > + add v20.4s, v20.4s, v22.4s > + sqxtun v20.4h, v20.4s > + .if \pix_size == 1 > + sqxtun v20.8b, v20.8h > + str s20, [dst] > + .else > + smin v20.4h, v20.4h, v25.4h > + str d20, [dst] > + .endif > + ret > + > + .unreq dst > + .unreq pp > + .unreq filter > + .unreq clip > + .unreq is_near_vb > + .unreq pix_max > +.endm > + > +function ff_alf_filter_luma_kernel_8_neon, export=1 > + alf_filter_luma_kernel 1 > +endfunc > + > +function ff_alf_filter_luma_kernel_12_neon, export=1 > + mov w5, 4095 > + b 1f > +endfunc > + > +function ff_alf_filter_luma_kernel_10_neon, export=1 > + mov w5, 1023 > +1: > + alf_filter_luma_kernel 2 > +endfunc > + > +function ff_alf_filter_chroma_kernel_8_neon, export=1 > + alf_filter_chroma_kernel 1 > +endfunc > + > +function ff_alf_filter_chroma_kernel_12_neon, export=1 > + mov w5, 4095 > + b 1f > +endfunc > + > +function ff_alf_filter_chroma_kernel_10_neon, export=1 > + mov w5, 1023 > +1: > + alf_filter_chroma_kernel 2 > +endfunc > diff --git a/libavcodec/aarch64/vvc/alf_template.c > b/libavcodec/aarch64/vvc/alf_template.c > new file mode 100644 > index 0000000000..b1927966d2 
> --- /dev/null > +++ b/libavcodec/aarch64/vvc/alf_template.c > @@ -0,0 +1,157 @@ > +/* > + * VVC filters DSP > + * > + * Copyright (C) 2024 Zhao Zhili > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavcodec/bit_depth_template.c" > + > +void FUNC2(ff_alf_filter_luma_kernel, BIT_DEPTH, _neon)(pixel *dst, > + const pixel **p, > + const int16_t *filter, > + const int16_t *clip, > + int is_near_vb); > + > +void FUNC2(ff_alf_filter_chroma_kernel, BIT_DEPTH, _neon)(pixel *dst, > + const pixel **p, > + const int16_t *filter, > + const int16_t *clip, > + int is_near_vb); > + > +static void FUNC2(alf_filter_luma, BIT_DEPTH, _neon)(uint8_t *_dst, > + ptrdiff_t dst_stride, > + const uint8_t *_src, > + ptrdiff_t src_stride, > + const int width, const int > height, > + const int16_t *filter, > + const int16_t *clip, > + const int vb_pos) > +{ > + const pixel *src = (pixel *)_src; > + > + dst_stride /= sizeof(pixel); > + src_stride /= sizeof(pixel); > + > + for (int y = 0; y < height; y += ALF_BLOCK_SIZE) { > + int far = (y + 3 < vb_pos - 3) || (y > vb_pos + 2); > + > + for (int x = 0; x < width; x += 2 * ALF_BLOCK_SIZE) { > + const pixel *s0 = src + y * src_stride + x; > + const pixel *s1 = s0 + 
src_stride; > + const pixel *s2 = s0 - src_stride; > + const pixel *s3 = s1 + src_stride; > + const pixel *s4 = s2 - src_stride; > + const pixel *s5 = s3 + src_stride; > + const pixel *s6 = s4 - src_stride; > + > + for (int i = 0; i < ALF_BLOCK_SIZE; i++) { > + pixel *dst = (pixel *) _dst + (y + i) * dst_stride + x; > + > + const pixel *p0 = s0 + i * src_stride; > + const pixel *p1 = s1 + i * src_stride; > + const pixel *p2 = s2 + i * src_stride; > + const pixel *p3 = s3 + i * src_stride; > + const pixel *p4 = s4 + i * src_stride; > + const pixel *p5 = s5 + i * src_stride; > + const pixel *p6 = s6 + i * src_stride; > + int is_near_vb = 0; > + > + if (!far) { > + is_near_vb = (y + i == vb_pos - 1) || (y + i == > vb_pos); > + if (is_near_vb) { > + p1 = p0; > + p2 = p0; > + } > + if (y + i >= vb_pos - 2 && y + i <= vb_pos + 1) { > + p3 = p1; > + p4 = p2; > + } > + if (y + i >= vb_pos - 3 && y + i <= vb_pos + 2) { > + p5 = p3; > + p6 = p4; > + } > + } > + FUNC2(ff_alf_filter_luma_kernel, BIT_DEPTH, _neon)(dst, > + (const pixel *[]) { p0, p1, p2, p3, p4, p5, > p6}, > + filter, > + clip, > + is_near_vb); > + } > + filter += 2 * ALF_NUM_COEFF_LUMA; > + clip += 2 * ALF_NUM_COEFF_LUMA; > + } > + } > +} > + > +static void FUNC2(alf_filter_chroma, BIT_DEPTH, _neon)(uint8_t *_dst, > + ptrdiff_t > dst_stride, > + const uint8_t > *_src, > + ptrdiff_t > src_stride, > + const int width, > + const int height, > + const int16_t > *filter, > + const int16_t > *clip, > + const int vb_pos) > +{ > + const pixel *src = (pixel *)_src; > + > + dst_stride /= sizeof(pixel); > + src_stride /= sizeof(pixel); > + > + for (int y = 0; y < height; y += ALF_BLOCK_SIZE) { > + int far = (y + 3 < vb_pos - 2) || (y > vb_pos + 1); > + > + for (int x = 0; x < width; x += ALF_BLOCK_SIZE) { > + const pixel *s0 = src + y * src_stride + x; > + const pixel *s1 = s0 + src_stride; > + const pixel *s2 = s0 - src_stride; > + const pixel *s3 = s1 + src_stride; > + const pixel *s4 = s2 - src_stride; > + > + for 
(int i = 0; i < ALF_BLOCK_SIZE; i++) { > + pixel *dst = (pixel *)_dst + (y + i) * dst_stride + x; > + > + const pixel *p0 = s0 + i * src_stride; > + const pixel *p1 = s1 + i * src_stride; > + const pixel *p2 = s2 + i * src_stride; > + const pixel *p3 = s3 + i * src_stride; > + const pixel *p4 = s4 + i * src_stride; > + int is_near_vb = 0; > + > + if (!far) { > + is_near_vb = (y + i == vb_pos - 1) || (y + i == > vb_pos); > + if (is_near_vb) { > + p1 = p0; > + p2 = p0; > + } > + > + if (y + i >= vb_pos - 2 && y + i <= vb_pos + 1) { > + p3 = p1; > + p4 = p2; > + } > + } > + > + FUNC2(ff_alf_filter_chroma_kernel, BIT_DEPTH, _neon)(dst, > + (const pixel *[]){p0, p1, p2, p3, p4}, > + filter, clip, > + is_near_vb); > + } > + } > + } > +} > \ No newline at end of file > diff --git a/libavcodec/aarch64/vvc/dsp_init.c > b/libavcodec/aarch64/vvc/dsp_init.c > new file mode 100644 > index 0000000000..926e16f70a > --- /dev/null > +++ b/libavcodec/aarch64/vvc/dsp_init.c > @@ -0,0 +1,57 @@ > +/* > + * VVC filters DSP > + * > + * Copyright (C) 2024 Zhao Zhili > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > + */ > + > +#include "libavutil/cpu.h" > +#include "libavutil/aarch64/cpu.h" > +#include "libavcodec/vvc/dsp.h" > +#include "libavcodec/vvc/dec.h" > +#include "libavcodec/vvc/ctu.h" > + > +#define BIT_DEPTH 8 > +#include "alf_template.c" > +#undef BIT_DEPTH > + > +#define BIT_DEPTH 10 > +#include "alf_template.c" > +#undef BIT_DEPTH > + > +#define BIT_DEPTH 12 > +#include "alf_template.c" > +#undef BIT_DEPTH > + > +void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd) > +{ > + int cpu_flags = av_get_cpu_flags(); > + if (!have_neon(cpu_flags)) > + return; > + > + if (bd == 8) { > + c->alf.filter[LUMA] = alf_filter_luma_8_neon; > + c->alf.filter[CHROMA] = alf_filter_chroma_8_neon; > + } else if (bd == 10) { > + c->alf.filter[LUMA] = alf_filter_luma_10_neon; > + c->alf.filter[CHROMA] = alf_filter_chroma_10_neon; > + } else if (bd == 12) { > + c->alf.filter[LUMA] = alf_filter_luma_12_neon; > + c->alf.filter[CHROMA] = alf_filter_chroma_12_neon; > + } > +} > \ No newline at end of file > diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c > index 41e830a98a..648d54ebb2 100644 > --- a/libavcodec/vvc/dsp.c > +++ b/libavcodec/vvc/dsp.c > @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int > bit_depth) > break; > } > > -#if ARCH_X86 > +#if ARCH_AARCH64 > + ff_vvc_dsp_init_aarch64(vvcdsp, bit_depth); > +#elif ARCH_X86 > ff_vvc_dsp_init_x86(vvcdsp, bit_depth); > #endif > } > diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h > index 1f14096c41..0b49b97021 100644 > --- a/libavcodec/vvc/dsp.h > +++ b/libavcodec/vvc/dsp.h > @@ -180,6 +180,7 @@ typedef struct VVCDSPContext { > > void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth); > > +void ff_vvc_dsp_init_aarch64(VVCDSPContext *hpc, const int bit_depth); > 
void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth); > > #endif /* AVCODEC_VVC_DSP_H */ > -- > 2.42.0 > > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".