[FFmpeg-cvslog] avcodec/ac3dsp: add missing stddef.h include
ffmpeg | branch: master | James Almer | Fri Dec 1 12:42:22 2023 -0300| [6d196112516f5298f263eeb29a8a1626b6e090d4] | committer: James Almer avcodec/ac3dsp: add missing stddef.h include Should fix make checkheaders Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6d196112516f5298f263eeb29a8a1626b6e090d4 --- libavcodec/ac3dsp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h index ec2f598451..ae33b361a9 100644 --- a/libavcodec/ac3dsp.h +++ b/libavcodec/ac3dsp.h @@ -22,6 +22,7 @@ #ifndef AVCODEC_AC3DSP_H #define AVCODEC_AC3DSP_H +#include <stddef.h> #include <stdint.h> /** ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] checkasm/ac3dsp: add float_to_fixed24 test
ffmpeg | branch: master | sunyuechi | Wed Nov 22 14:57:29 2023 +0800| [d0ec826077c49f4cbf286621771a4a43a9bf57b8] | committer: Rémi Denis-Courmont checkasm/ac3dsp: add float_to_fixed24 test Signed-off-by: Rémi Denis-Courmont > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=d0ec826077c49f4cbf286621771a4a43a9bf57b8 --- tests/checkasm/Makefile | 1 + tests/checkasm/ac3dsp.c | 70 +++ tests/checkasm/checkasm.c | 3 ++ tests/checkasm/checkasm.h | 1 + tests/fate/checkasm.mak | 1 + 5 files changed, 76 insertions(+) diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile index 8bc241d29b..53742c93ae 100644 --- a/tests/checkasm/Makefile +++ b/tests/checkasm/Makefile @@ -1,5 +1,6 @@ # libavcodec tests # subsystems +AVCODECOBJS-$(CONFIG_AC3DSP)+= ac3dsp.o AVCODECOBJS-$(CONFIG_AUDIODSP) += audiodsp.o AVCODECOBJS-$(CONFIG_BLOCKDSP) += blockdsp.o AVCODECOBJS-$(CONFIG_BSWAPDSP) += bswapdsp.o diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c new file mode 100644 index 00..8f36f1736c --- /dev/null +++ b/tests/checkasm/ac3dsp.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS). + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with FFmpeg; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include + +#include "libavutil/mem.h" +#include "libavutil/mem_internal.h" + +#include "libavcodec/ac3dsp.h" + +#include "checkasm.h" + +#define randomize_float(buf, len) \ +do {\ +int i; \ +for (i = 0; i < len; i++) { \ +float f = (float)rnd() / (UINT_MAX >> 5) - 16.0f; \ +buf[i] = f; \ +} \ +} while (0) + +static void check_float_to_fixed24(AC3DSPContext *c) { +#define BUF_SIZE 1024 +LOCAL_ALIGNED_32(float, src, [BUF_SIZE]); + +declare_func(void, int32_t *, const float *, unsigned int); + +randomize_float(src, BUF_SIZE); + +if (check_func(c->float_to_fixed24, "float_to_fixed24")) { +LOCAL_ALIGNED_32(int32_t, dst, [BUF_SIZE]); +LOCAL_ALIGNED_32(int32_t, dst2, [BUF_SIZE]); + +call_ref(dst, src, BUF_SIZE); +call_new(dst2, src, BUF_SIZE); + +if (memcmp(dst, dst2, BUF_SIZE) != 0) +fail(); + +bench_new(dst, src, BUF_SIZE); +} + + +report("float_to_fixed24"); +} + +void checkasm_check_ac3dsp(void) +{ +AC3DSPContext c; +ff_ac3dsp_init(&c); + +check_float_to_fixed24(&c); +} diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c index a15e801caf..0a1285eca4 100644 --- a/tests/checkasm/checkasm.c +++ b/tests/checkasm/checkasm.c @@ -77,6 +77,9 @@ static const struct { { "aacpsdsp", checkasm_check_aacpsdsp }, { "sbrdsp", checkasm_check_sbrdsp }, #endif +#if CONFIG_AC3DSP +{ "ac3dsp", checkasm_check_ac3dsp }, +#endif #if CONFIG_ALAC_DECODER { "alacdsp", checkasm_check_alacdsp }, #endif diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h index 41093f2dca..11d2f7286f 100644 --- a/tests/checkasm/checkasm.h +++ b/tests/checkasm/checkasm.h @@ -44,6 +44,7 @@ #include "libavutil/timer.h" void checkasm_check_aacpsdsp(void); +void checkasm_check_ac3dsp(void); void checkasm_check_afir(void); void checkasm_check_alacdsp(void); void checkasm_check_audiodsp(void); diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak index 57b0dff4f2..b8ffa0a77e 100644 --- a/tests/fate/checkasm.mak +++ b/tests/fate/checkasm.mak @@ -1,4 +1,5 @@ FATE_CHECKASM = 
fate-checkasm-aacpsdsp \ +fate-checkasm-ac3dsp\ fate-checkasm-af_afir \ fate-checkasm-alacdsp \ fate-checkasm-audiodsp \ ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinf
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels
ffmpeg | branch: master | Logan Lyu | Sun Nov 5 16:33:17 2023 +0800| [40cf4a5ca3ce1bdb2623ac0f8a956c27203540ea] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_pel_bi_pixels put_hevc_pel_bi_pixels4_8_c: 54.7 put_hevc_pel_bi_pixels4_8_neon: 43.0 put_hevc_pel_bi_pixels6_8_c: 94.7 put_hevc_pel_bi_pixels6_8_neon: 37.0 put_hevc_pel_bi_pixels8_8_c: 171.0 put_hevc_pel_bi_pixels8_8_neon: 24.0 put_hevc_pel_bi_pixels12_8_c: 354.0 put_hevc_pel_bi_pixels12_8_neon: 68.7 put_hevc_pel_bi_pixels16_8_c: 588.2 put_hevc_pel_bi_pixels16_8_neon: 77.5 put_hevc_pel_bi_pixels24_8_c: 1670.7 put_hevc_pel_bi_pixels24_8_neon: 173.0 put_hevc_pel_bi_pixels32_8_c: 2267.7 put_hevc_pel_bi_pixels32_8_neon: 281.2 put_hevc_pel_bi_pixels48_8_c: 5787.5 put_hevc_pel_bi_pixels48_8_neon: 673.5 put_hevc_pel_bi_pixels64_8_c: 9897.0 put_hevc_pel_bi_pixels64_8_neon: 1159.5 Co-Authored-By: J. Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=40cf4a5ca3ce1bdb2623ac0f8a956c27203540ea --- libavcodec/aarch64/hevcdsp_epel_neon.S| 179 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 6 + 2 files changed, 185 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index c077c204cc..b441f26bed 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -244,6 +244,185 @@ function ff_hevc_put_hevc_pel_pixels64_8_neon, export=1 endfunc +function ff_hevc_put_hevc_pel_bi_pixels4_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v0.s}[0], [x2], x3 // src +ushll v16.8h, v0.8b, #6 +ld1 {v20.4h}, [x4], x10 // src2 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv0.8b, v16.8h, #7 +st1 {v0.s}[0], [x0], x1 +subsw5, w5, #1 +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_pel_bi_pixels6_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +sub x1, x1, #4 +1: ld1 {v0.8b}, [x2], x3 +ushll v16.8h, v0.8b, #6 +ld1 {v20.8h}, [x4], x10 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv0.8b, v16.8h, 
#7 +st1 {v0.s}[0], [x0], #4 +st1 {v0.h}[2], [x0], x1 +subsw5, w5, #1 +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_pel_bi_pixels8_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v0.8b}, [x2], x3// src +ushll v16.8h, v0.8b, #6 +ld1 {v20.8h}, [x4], x10 // src2 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv0.8b, v16.8h, #7 +subsw5, w5, #1 +st1 {v0.8b}, [x0], x1 +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_pel_bi_pixels12_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +sub x1, x1, #8 +1: ld1 {v0.16b}, [x2], x3 +ushll v16.8h, v0.8b, #6 +ushll2 v17.8h, v0.16b, #6 +ld1 {v20.8h, v21.8h}, [x4], x10 +sqadd v16.8h, v16.8h, v20.8h +sqadd v17.8h, v17.8h, v21.8h +sqrshrunv0.8b, v16.8h, #7 +sqrshrun2 v0.16b, v17.8h, #7 +st1 {v0.8b}, [x0], #8 +subsw5, w5, #1 +st1 {v0.s}[2], [x0], x1 +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_pel_bi_pixels16_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v0.16b}, [x2], x3 // src +ushll v16.8h, v0.8b, #6 +ushll2 v17.8h, v0.16b, #6 +ld1 {v20.8h, v21.8h}, [x4], x10 // src2 +sqadd v16.8h, v16.8h, v20.8h +sqadd v17.8h, v17.8h, v21.8h +sqrshrunv0.8b, v16.8h, #7 +sqrshrun2 v0.16b, v17.8h, #7 +subsw5, w5, #1 +st1 {v0.16b}, [x0], x1 +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_pel_bi_pixels24_8_neon, export=1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v0.8b-v2.8b}, [x2], x3 // src +ushll v16.8h, v0.8b, #6 +ushll v17.8h, v1.8b, #6 +ushll v18.8h, v2.8b, #6 +ld1 {v20.8h-v22.8h}, [x4], x10 // src2 +sqadd v16.8h, v16.8h, v20.8h +sqadd v17.8h, v17.8h, v21.8h +sqadd v18.8h, v18.8h, v22.8h +sqrshrunv0.8b, v16.8h, #7 +sqrshrunv1.8b, v17.8h, #7 +sqrshrunv2.8b, v18.8h, #7 +subsw5, w5,
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_epel_bi_h
ffmpeg | branch: master | Logan Lyu | Sat Nov 11 17:54:35 2023 +0800| [216275bd8098fc4a08fd4c38191c8c217a6b897a] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_epel_bi_h put_hevc_epel_bi_h4_8_c: 96.0 put_hevc_epel_bi_h4_8_neon: 36.3 put_hevc_epel_bi_h6_8_c: 288.3 put_hevc_epel_bi_h6_8_neon: 59.3 put_hevc_epel_bi_h8_8_c: 358.5 put_hevc_epel_bi_h8_8_neon: 61.5 put_hevc_epel_bi_h12_8_c: 759.8 put_hevc_epel_bi_h12_8_neon: 159.5 put_hevc_epel_bi_h16_8_c: 1307.0 put_hevc_epel_bi_h16_8_neon: 182.0 put_hevc_epel_bi_h24_8_c: 2778.3 put_hevc_epel_bi_h24_8_neon: 430.5 put_hevc_epel_bi_h32_8_c: 4952.3 put_hevc_epel_bi_h32_8_neon: 679.5 put_hevc_epel_bi_h48_8_c: 11803.3 put_hevc_epel_bi_h48_8_neon: 1443.5 put_hevc_epel_bi_h64_8_c: 20654.8 put_hevc_epel_bi_h64_8_neon: 2737.0 put_hevc_qpel_bi_h4_8_c: 140.0 put_hevc_qpel_bi_h4_8_neon: 111.5 put_hevc_qpel_bi_h6_8_c: 318.0 put_hevc_qpel_bi_h6_8_neon: 85.8 put_hevc_qpel_bi_h8_8_c: 536.5 put_hevc_qpel_bi_h8_8_neon: 95.3 put_hevc_qpel_bi_h12_8_c: 1188.5 put_hevc_qpel_bi_h12_8_neon: 291.3 put_hevc_qpel_bi_h16_8_c: 2064.3 put_hevc_qpel_bi_h16_8_neon: 365.3 put_hevc_qpel_bi_h24_8_c: 4757.5 put_hevc_qpel_bi_h24_8_neon: 1010.0 put_hevc_qpel_bi_h32_8_c: 8351.8 put_hevc_qpel_bi_h32_8_neon: 2917.8 put_hevc_qpel_bi_h48_8_c: 19299.8 put_hevc_qpel_bi_h48_8_neon: 2976.8 put_hevc_qpel_bi_h64_8_c: 34182.5 put_hevc_qpel_bi_h64_8_neon: 5236.3 Co-Authored-By: J. 
Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=216275bd8098fc4a08fd4c38191c8c217a6b897a --- libavcodec/aarch64/hevcdsp_epel_neon.S| 257 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + 2 files changed, 262 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index b441f26bed..b84d7db1fb 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -423,6 +423,263 @@ function ff_hevc_put_hevc_pel_bi_pixels64_8_neon, export=1 ret endfunc + +function ff_hevc_put_hevc_epel_bi_h4_8_neon, export=1 +load_epel_filterb x6, x7 +sub x2, x2, #1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v4.8b}, [x2], x3 +ext v5.8b, v4.8b, v4.8b, #1 +ext v6.8b, v4.8b, v4.8b, #2 +ext v7.8b, v4.8b, v4.8b, #3 +calc_epelb v16, v4, v5, v6, v7 +ld1 {v20.4h}, [x4], x10 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv4.8b, v16.8h, #7 +st1 {v4.s}[0], [x0], x1 +subsw5, w5, #1 // height +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_epel_bi_h6_8_neon, export=1 +load_epel_filterb x6, x7 +sub w1, w1, #4 +sub x2, x2, #1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v24.16b}, [x2], x3 +ext v26.16b, v24.16b, v24.16b, #1 +ext v27.16b, v24.16b, v24.16b, #2 +ext v28.16b, v24.16b, v24.16b, #3 +calc_epelb v16, v24, v26, v27, v28 +ld1 {v20.8h}, [x4], x10 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv16.8b, v16.8h, #7 +st1 {v16.s}[0], [x0], #4 +st1 {v16.h}[2], [x0], x1 +subsw5, w5, #1 // height +b.ne1b +ret +endfunc + +function ff_hevc_put_hevc_epel_bi_h8_8_neon, export=1 +load_epel_filterb x6, x7 +sub x2, x2, #1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v24.16b}, [x2], x3 +ext v26.16b, v24.16b, v24.16b, #1 +ext v27.16b, v24.16b, v24.16b, #2 +ext v28.16b, v24.16b, v24.16b, #3 +calc_epelb v16, v24, v26, v27, v28 +ld1 {v20.8h}, [x4], x10 +sqadd v16.8h, v16.8h, v20.8h +sqrshrunv16.8b, v16.8h, #7 +st1 {v16.8b}, [x0], x1 +subsw5, w5, #1 // height +b.ne1b +ret +endfunc + +function 
ff_hevc_put_hevc_epel_bi_h12_8_neon, export=1 +load_epel_filterb x6, x7 +sub x1, x1, #8 +sub x2, x2, #1 +mov x10, #(MAX_PB_SIZE * 2) +1: ld1 {v24.16b}, [x2], x3 +ext v26.16b, v24.16b, v24.16b, #1 +ext v27.16b, v24.16b, v24.16b, #2 +ext v28.16b, v24.16b, v24.16b, #3 +calc_epelb v16, v24, v26, v27, v28 +calc_epelb2 v17, v24, v26, v27, v28 +ld1 {v20.8h, v21.8h}, [x4], x10 +sqadd v18.8h, v16.8h, v20.8h +sqadd v19.8h, v17.8h, v21.8h +sqrshrunv20.8b, v18.8h, #7 +sqrs
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_epel_bi_v
ffmpeg | branch: master | Logan Lyu | Sat Nov 11 19:17:36 2023 +0800| [0448f27f41457a058256f0f5145c91e88064e051] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_epel_bi_v put_hevc_epel_bi_v4_8_c: 138.4 put_hevc_epel_bi_v4_8_neon: 33.7 put_hevc_epel_bi_v6_8_c: 302.9 put_hevc_epel_bi_v6_8_neon: 46.7 put_hevc_epel_bi_v8_8_c: 408.7 put_hevc_epel_bi_v8_8_neon: 48.7 put_hevc_epel_bi_v12_8_c: 779.4 put_hevc_epel_bi_v12_8_neon: 139.7 put_hevc_epel_bi_v16_8_c: 1344.9 put_hevc_epel_bi_v16_8_neon: 160.2 put_hevc_epel_bi_v24_8_c: 2981.7 put_hevc_epel_bi_v24_8_neon: 344.9 put_hevc_epel_bi_v32_8_c: 5280.9 put_hevc_epel_bi_v32_8_neon: 618.4 put_hevc_epel_bi_v48_8_c: 12494.9 put_hevc_epel_bi_v48_8_neon: 1364.4 put_hevc_epel_bi_v64_8_c: 22127.7 put_hevc_epel_bi_v64_8_neon: 2473.7 Co-Authored-By: J. Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0448f27f41457a058256f0f5145c91e88064e051 --- libavcodec/aarch64/hevcdsp_epel_neon.S| 212 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + 2 files changed, 217 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index b84d7db1fb..2f9e7e46c4 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -680,6 +680,218 @@ function ff_hevc_put_hevc_epel_bi_h64_8_neon, export=1 ret endfunc +function ff_hevc_put_hevc_epel_bi_v4_8_neon, export=1 +load_epel_filterb x7, x6 +sub x2, x2, x3 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.s}[0], [x2], x3 +ld1 {v17.s}[0], [x2], x3 +ld1 {v18.s}[0], [x2], x3 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().s}[0], [x2], x3 +calc_epelb v4, \src0, \src1, \src2, \src3 +ld1 {v24.4h}, [x4], x10 +sqadd v4.8h, v4.8h, v24.8h +sqrshrunv4.8b, v4.8h, #7 +subsw5, w5, #1 +st1 {v4.s}[0], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_v6_8_neon, export=1 +load_epel_filterb x7, x6 +sub x2, x2, x3 +sub x1, x1, #4 
+mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.8b}, [x2], x3 +ld1 {v17.8b}, [x2], x3 +ld1 {v18.8b}, [x2], x3 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().8b}, [x2], x3 +calc_epelb v4, \src0, \src1, \src2, \src3 +ld1 {v24.8h}, [x4], x10 +sqadd v4.8h, v4.8h, v24.8h +sqrshrunv4.8b, v4.8h, #7 +st1 {v4.s}[0], [x0], #4 +subsw5, w5, #1 +st1 {v4.h}[2], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_v8_8_neon, export=1 +load_epel_filterb x7, x6 +sub x2, x2, x3 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.8b}, [x2], x3 +ld1 {v17.8b}, [x2], x3 +ld1 {v18.8b}, [x2], x3 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().8b}, [x2], x3 +calc_epelb v4, \src0, \src1, \src2, \src3 +ld1 {v24.8h}, [x4], x10 +sqadd v4.8h, v4.8h, v24.8h +sqrshrunv4.8b, v4.8h, #7 +subsw5, w5, #1 +st1 {v4.8b}, [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_v12_8_neon, export=1 +load_epel_filterb x7, x6 +sub x1, x1, #8 +sub x2, x2, x3 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.16b}, [x2], x3 +ld1 {v17.16b}, [x2], x3 +ld1 {v18.16b}, [x2], x3 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().16b}, [x2], x3 +calc_epelb v4, \src0, \src1, \src2, \src3 +calc_epelb2 v5, \src0, \src1, \src2, \src3 +ld1 {v24.8h, v25.8h}, [x4], x10 +sqadd v4.8h, v4.8h, v24.8h +sqadd v5.8h, v5.8h, v25.8h +sqrshrunv4.8b, v4.8h, #7 +sqrshrun2 v4.16b, v5.8h, #7 +st1 {v4.8b}, [x0], #8 +subsw5, w5, #1 +st1 {v4.s}[2], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_v16_8_neon, export=1 +load_epel_filterb x7, x6 +sub x2, x2, x3 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.16b}, [x2], x3 +ld1 {v17.16b}, [x2], x3 +ld1 {v18.16b}, [x2], x3 +.mac
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_epel_bi_hv
ffmpeg | branch: master | Logan Lyu | Sat Nov 11 19:57:40 2023 +0800| [00290a64f758acafef80d88bb06760cd7bbd9eac] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_epel_bi_hv put_hevc_epel_bi_hv4_8_c: 242.9 put_hevc_epel_bi_hv4_8_i8mm: 68.6 put_hevc_epel_bi_hv6_8_c: 402.4 put_hevc_epel_bi_hv6_8_i8mm: 135.9 put_hevc_epel_bi_hv8_8_c: 636.4 put_hevc_epel_bi_hv8_8_i8mm: 145.6 put_hevc_epel_bi_hv12_8_c: 1363.1 put_hevc_epel_bi_hv12_8_i8mm: 324.1 put_hevc_epel_bi_hv16_8_c: .1 put_hevc_epel_bi_hv16_8_i8mm: 509.1 put_hevc_epel_bi_hv24_8_c: 4793.4 put_hevc_epel_bi_hv24_8_i8mm: 1091.9 put_hevc_epel_bi_hv32_8_c: 8393.9 put_hevc_epel_bi_hv32_8_i8mm: 1720.6 put_hevc_epel_bi_hv48_8_c: 19526.6 put_hevc_epel_bi_hv48_8_i8mm: 4285.9 put_hevc_epel_bi_hv64_8_c: 33915.4 put_hevc_epel_bi_hv64_8_i8mm: 6783.6 Co-Authored-By: J. Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=00290a64f758acafef80d88bb06760cd7bbd9eac --- libavcodec/aarch64/hevcdsp_epel_neon.S| 330 ++ libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + 2 files changed, 335 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S b/libavcodec/aarch64/hevcdsp_epel_neon.S index 2f9e7e46c4..2dafa09337 100644 --- a/libavcodec/aarch64/hevcdsp_epel_neon.S +++ b/libavcodec/aarch64/hevcdsp_epel_neon.S @@ -3203,6 +3203,336 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_neon_i8mm, export=1 ret endfunc + +function ff_hevc_put_hevc_epel_bi_hv4_8_neon_i8mm, export=1 +add w10, w5, #3 +lsl x10, x10, #7 +sub sp, sp, x10 // tmp_array +stp x7, x30, [sp, #-48]! 
+stp x4, x5, [sp, #16] +stp x0, x1, [sp, #32] +add x0, sp, #48 +sub x1, x2, x3 +mov x2, x3 +add w3, w5, #3 +mov x4, x6 +mov x5, x7 +bl X(ff_hevc_put_hevc_epel_h4_8_neon_i8mm) +ldp x4, x5, [sp, #16] +ldp x0, x1, [sp, #32] +ldp x7, x30, [sp], #48 +load_epel_filterh x7, x6 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.4h}, [sp], x10 +ld1 {v17.4h}, [sp], x10 +ld1 {v18.4h}, [sp], x10 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().4h}, [sp], x10 +calc_epelh v4, \src0, \src1, \src2, \src3 +ld1 {v6.4h}, [x4], x10 +sqadd v4.4h, v4.4h, v6.4h +sqrshrunv4.8b, v4.8h, #7 +subsw5, w5, #1 +st1 {v4.s}[0], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_hv6_8_neon_i8mm, export=1 +add w10, w5, #3 +lsl x10, x10, #7 +sub sp, sp, x10 // tmp_array +stp x7, x30, [sp, #-48]! +stp x4, x5, [sp, #16] +stp x0, x1, [sp, #32] +add x0, sp, #48 +sub x1, x2, x3 +mov x2, x3 +add w3, w5, #3 +mov x4, x6 +mov x5, x7 +bl X(ff_hevc_put_hevc_epel_h6_8_neon_i8mm) +ldp x4, x5, [sp, #16] +ldp x0, x1, [sp, #32] +ldp x7, x30, [sp], #48 +load_epel_filterh x7, x6 +sub x1, x1, #4 +mov x10, #(MAX_PB_SIZE * 2) +ld1 {v16.8h}, [sp], x10 +ld1 {v17.8h}, [sp], x10 +ld1 {v18.8h}, [sp], x10 +.macro calc src0, src1, src2, src3 +ld1 {\src3\().8h}, [sp], x10 +calc_epelh v4, \src0, \src1, \src2, \src3 +calc_epelh2 v4, v5, \src0, \src1, \src2, \src3 +ld1 {v6.8h}, [x4], x10 +sqadd v4.8h, v4.8h, v6.8h +sqrshrunv4.8b, v4.8h, #7 +st1 {v4.s}[0], [x0], #4 +subsw5, w5, #1 +st1 {v4.h}[2], [x0], x1 +.endm +1: calc_all4 +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_epel_bi_hv8_8_neon_i8mm, export=1 +add w10, w5, #3 +lsl x10, x10, #7 +sub sp, sp, x10 // tmp_array +stp x7, x30, [sp, #-48]! +stp x4, x5, [sp, #16] +stp x0, x1, [sp, #32] +add x0, sp, #48 +sub x1, x2, x3 +mov x2, x3 +add w3, w5, #3 +mov x4, x6 +mov x5, x7 +bl X(ff_hevc_put_hevc_epel_h8_8_neon_i8mm) +ldp x4, x5, [sp, #16] +ldp
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_qpel_bi_v
ffmpeg | branch: master | Logan Lyu | Sun Nov 12 08:32:10 2023 +0800| [595f97028b827a14dd979c76468e4da93b3adfd5] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_qpel_bi_v put_hevc_qpel_bi_v4_8_c: 166.1 put_hevc_qpel_bi_v4_8_neon: 61.9 put_hevc_qpel_bi_v6_8_c: 309.4 put_hevc_qpel_bi_v6_8_neon: 75.6 put_hevc_qpel_bi_v8_8_c: 531.1 put_hevc_qpel_bi_v8_8_neon: 78.1 put_hevc_qpel_bi_v12_8_c: 1139.9 put_hevc_qpel_bi_v12_8_neon: 238.1 put_hevc_qpel_bi_v16_8_c: 2063.6 put_hevc_qpel_bi_v16_8_neon: 308.9 put_hevc_qpel_bi_v24_8_c: 4317.1 put_hevc_qpel_bi_v24_8_neon: 629.9 put_hevc_qpel_bi_v32_8_c: 8241.9 put_hevc_qpel_bi_v32_8_neon: 1140.1 put_hevc_qpel_bi_v48_8_c: 18422.9 put_hevc_qpel_bi_v48_8_neon: 2533.9 put_hevc_qpel_bi_v64_8_c: 37508.6 put_hevc_qpel_bi_v64_8_neon: 4520.1 Co-Authored-By: J. Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=595f97028b827a14dd979c76468e4da93b3adfd5 --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + libavcodec/aarch64/hevcdsp_qpel_neon.S| 248 ++ 2 files changed, 253 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index c2cbcd95e7..9552549897 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -251,6 +251,10 @@ NEON8_FNPROTO_PARTIAL_5(qpel_uni_w_hv, (uint8_t *_dst, ptrdiff_t _dststride, int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width), _i8mm); +NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride, +const uint8_t *src, ptrdiff_t srcstride, const int16_t *src2, +int height, intptr_t mx, intptr_t my, int width),); + #define NEON8_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \ @@ -344,6 +348,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) NEON8_FNASSIGN(c->put_hevc_epel_bi, 0, 1, 
epel_bi_h,); NEON8_FNASSIGN(c->put_hevc_epel_bi, 1, 0, epel_bi_v,); NEON8_FNASSIGN(c->put_hevc_qpel_bi, 0, 0, pel_bi_pixels,); +NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 0, qpel_bi_v,); NEON8_FNASSIGN(c->put_hevc_epel_uni, 0, 0, pel_uni_pixels,); NEON8_FNASSIGN(c->put_hevc_epel_uni, 1, 0, epel_uni_v,); NEON8_FNASSIGN(c->put_hevc_qpel_uni, 0, 0, pel_uni_pixels,); diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S index bcee627cba..d01dd24a78 100644 --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S @@ -865,6 +865,254 @@ function ff_hevc_put_hevc_qpel_v64_8_neon, export=1 ret endfunc +function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1 +load_qpel_filterb x7, x6 +sub x2, x2, x3, lsl #1 +sub x2, x2, x3 +mov x12, #(MAX_PB_SIZE * 2) +ld1 {v16.s}[0], [x2], x3 +ld1 {v17.s}[0], [x2], x3 +ld1 {v18.s}[0], [x2], x3 +ld1 {v19.s}[0], [x2], x3 +ld1 {v20.s}[0], [x2], x3 +ld1 {v21.s}[0], [x2], x3 +ld1 {v22.s}[0], [x2], x3 +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 +ld1 {\tmp\().s}[0], [x2], x3 +moviv24.8h, #0 +calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 +ld1 {v25.4h}, [x4], x12 // src2 +sqadd v24.8h, v24.8h, v25.8h +sqrshrunv25.8b, v24.8h, #7 +subsw5, w5, #1 +st1 {v25.s}[0], [x0], x1 +.endm +1: calc_all +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_qpel_bi_v6_8_neon, export=1 +load_qpel_filterb x7, x6 +sub x2, x2, x3, lsl #1 +sub x2, x2, x3 +ld1 {v16.8b}, [x2], x3 +sub x1, x1, #4 +ld1 {v17.8b}, [x2], x3 +mov x12, #(MAX_PB_SIZE * 2) +ld1 {v18.8b}, [x2], x3 +ld1 {v19.8b}, [x2], x3 +ld1 {v20.8b}, [x2], x3 +ld1 {v21.8b}, [x2], x3 +ld1 {v22.8b}, [x2], x3 +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 +ld1 {\tmp\().8b}, [x2], x3 +moviv24.8h, #0 +calc_qpelb v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7 +ld1 {v25.8h}, [x4], x12 // src2 +sqadd v24.8h, v24.8h, v25.8h +sqrshrunv25.8b, v24.8h, #7 +st1 {v25.s}[0], [x0], #4 
+subsw5, w5, #1 +st1 {v25.h}[2], [x0], x1 +.endm +1: calc
[FFmpeg-cvslog] lavc/aarch64: new optimization for 8-bit hevc_qpel_bi_hv
ffmpeg | branch: master | Logan Lyu | Sun Nov 12 09:03:28 2023 +0800| [fa0470347e326fe1c9f54ab3dcdbdfa67fa5eddd] | committer: Martin Storsjö lavc/aarch64: new optimization for 8-bit hevc_qpel_bi_hv put_hevc_qpel_bi_hv4_8_c: 433.7 put_hevc_qpel_bi_hv4_8_i8mm: 117.9 put_hevc_qpel_bi_hv6_8_c: 803.9 put_hevc_qpel_bi_hv6_8_i8mm: 252.7 put_hevc_qpel_bi_hv8_8_c: 1296.4 put_hevc_qpel_bi_hv8_8_i8mm: 316.2 put_hevc_qpel_bi_hv12_8_c: 2867.4 put_hevc_qpel_bi_hv12_8_i8mm: 669.2 put_hevc_qpel_bi_hv16_8_c: 4709.4 put_hevc_qpel_bi_hv16_8_i8mm: 929.9 put_hevc_qpel_bi_hv24_8_c: 9639.7 put_hevc_qpel_bi_hv24_8_i8mm: 2072.4 put_hevc_qpel_bi_hv32_8_c: 16663.7 put_hevc_qpel_bi_hv32_8_i8mm: 3391.4 put_hevc_qpel_bi_hv48_8_c: 36972.9 put_hevc_qpel_bi_hv48_8_i8mm: 7505.7 put_hevc_qpel_bi_hv64_8_c: 64106.4 put_hevc_qpel_bi_hv64_8_i8mm: 13145.2 Co-Authored-By: J. Dekker Signed-off-by: Martin Storsjö > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=fa0470347e326fe1c9f54ab3dcdbdfa67fa5eddd --- libavcodec/aarch64/hevcdsp_init_aarch64.c | 5 + libavcodec/aarch64/hevcdsp_qpel_neon.S| 299 ++ 2 files changed, 304 insertions(+) diff --git a/libavcodec/aarch64/hevcdsp_init_aarch64.c b/libavcodec/aarch64/hevcdsp_init_aarch64.c index 9552549897..687b6cc5c3 100644 --- a/libavcodec/aarch64/hevcdsp_init_aarch64.c +++ b/libavcodec/aarch64/hevcdsp_init_aarch64.c @@ -255,6 +255,10 @@ NEON8_FNPROTO(qpel_bi_v, (uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, ptrdiff_t srcstride, const int16_t *src2, int height, intptr_t mx, intptr_t my, int width),); +NEON8_FNPROTO(qpel_bi_hv, (uint8_t *dst, ptrdiff_t dststride, +const uint8_t *src, ptrdiff_t srcstride, const int16_t *src2, +int height, intptr_t mx, intptr_t my, int width), _i8mm); + #define NEON8_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_hevc_put_hevc_##fn##4_8_neon##ext; \ member[2][v][h] = ff_hevc_put_hevc_##fn##6_8_neon##ext; \ @@ -370,6 +374,7 @@ av_cold void ff_hevc_dsp_init_aarch64(HEVCDSPContext *c, const int bit_depth) 
NEON8_FNASSIGN(c->put_hevc_qpel_uni_w, 0, 1, qpel_uni_w_h, _i8mm); NEON8_FNASSIGN(c->put_hevc_epel_uni_w, 1, 1, epel_uni_w_hv, _i8mm); NEON8_FNASSIGN_PARTIAL_5(c->put_hevc_qpel_uni_w, 1, 1, qpel_uni_w_hv, _i8mm); +NEON8_FNASSIGN(c->put_hevc_qpel_bi, 1, 1, qpel_bi_hv, _i8mm); } } diff --git a/libavcodec/aarch64/hevcdsp_qpel_neon.S b/libavcodec/aarch64/hevcdsp_qpel_neon.S index d01dd24a78..9be29cafe2 100644 --- a/libavcodec/aarch64/hevcdsp_qpel_neon.S +++ b/libavcodec/aarch64/hevcdsp_qpel_neon.S @@ -4200,5 +4200,304 @@ function ff_hevc_put_hevc_qpel_uni_w_hv64_8_neon_i8mm, export=1 ret endfunc +function ff_hevc_put_hevc_qpel_bi_hv4_8_neon_i8mm, export=1 +add w10, w5, #7 +lsl x10, x10, #7 +sub sp, sp, x10 // tmp_array +stp x7, x30, [sp, #-48]! +stp x4, x5, [sp, #16] +stp x0, x1, [sp, #32] +sub x1, x2, x3, lsl #1 +sub x1, x1, x3 +add x0, sp, #48 +mov x2, x3 +add w3, w5, #7 +mov x4, x6 +bl X(ff_hevc_put_hevc_qpel_h4_8_neon_i8mm) +ldp x4, x5, [sp, #16] +ldp x0, x1, [sp, #32] +ldp x7, x30, [sp], #48 +mov x9, #(MAX_PB_SIZE * 2) +load_qpel_filterh x7, x6 +ld1 {v16.4h}, [sp], x9 +ld1 {v17.4h}, [sp], x9 +ld1 {v18.4h}, [sp], x9 +ld1 {v19.4h}, [sp], x9 +ld1 {v20.4h}, [sp], x9 +ld1 {v21.4h}, [sp], x9 +ld1 {v22.4h}, [sp], x9 +.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7 +ld1 {\tmp\().4h}, [sp], x9 +calc_qpelh v1, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sshr +ld1 {v5.4h}, [x4], x9 // src2 +saddw v1.4s, v1.4s, v5.4h +rshrn v1.4h, v1.4s, #7 +sqxtun v1.8b, v1.8h +subsw5, w5, #1 +st1 {v1.s}[0], [x0], x1 +.endm +1: calc_all +.purgem calc +2: ret +endfunc + +function ff_hevc_put_hevc_qpel_bi_hv6_8_neon_i8mm, export=1 +add w10, w5, #7 +lsl x10, x10, #7 +sub sp, sp, x10 // tmp_array +stp x7, x30, [sp, #-48]! +stp x4, x5, [sp, #16] +stp x0, x1, [sp, #32] +sub x1, x2, x3, lsl #1 +sub x1, x1, x3 +add x0, sp, #48 +mov x2, x3 +add x3, x5, #7 +mov x4, x6 +bl X(f
[FFmpeg-cvslog] avfilter/vf_chromanr: compare correct variables for advanced mode
ffmpeg | branch: master | Paul B Mahol | Fri Dec 1 21:11:57 2023 +0100| [db7b8382376e6b49cfc44583036759be59156f22] | committer: Paul B Mahol avfilter/vf_chromanr: compare correct variables for advanced mode > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=db7b8382376e6b49cfc44583036759be59156f22 --- libavfilter/vf_chromanr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavfilter/vf_chromanr.c b/libavfilter/vf_chromanr.c index dd49d8670a..6f969f981c 100644 --- a/libavfilter/vf_chromanr.c +++ b/libavfilter/vf_chromanr.c @@ -158,7 +158,7 @@ static int distance ## _slice##name(AVFilterContext *ctx, void *arg, su += U; \ sv += V; \ cn++; \ -} else if (fun(cyY, cuU, cvV) < thres) { \ +} else if (!extra && fun(cyY, cuU, cvV) < thres) { \ su += U; \ sv += V; \ cn++; \ @@ -210,7 +210,7 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) s->thres_u = s->threshold_u * (1 << (s->depth - 8)); s->thres_v = s->threshold_v * (1 << (s->depth - 8)); -if (s->thres_y < 200.f || s->thres_u < 200.f || s->thres_v < 200.f) { +if (s->threshold_y < 200.f || s->threshold_u < 200.f || s->threshold_v < 200.f) { switch (s->distance) { case 0: s->filter_slice = s->depth <= 8 ? manhattan_e_slice8 : manhattan_e_slice16; ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avformat/mov: Fix integer overflow in mov_read_packet().
ffmpeg | branch: master | Dale Curtis | Wed Nov 22 22:17:37 2023 +0000| [2182173a6933c02b0853751034bd5e0bf829b5f7] | committer: Michael Niedermayer avformat/mov: Fix integer overflow in mov_read_packet(). Fixes https://crbug.com/1499669: runtime error: signed integer overflow: 9223372036853334272 + 1375731456 cannot be represented in type 'int64_t' (aka 'long') Signed-off-by: Dale Curtis Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2182173a6933c02b0853751034bd5e0bf829b5f7 --- libavformat/mov.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavformat/mov.c b/libavformat/mov.c index 34ca8095c2..f7b5ec7a35 100644 --- a/libavformat/mov.c +++ b/libavformat/mov.c @@ -9006,7 +9006,7 @@ static int mov_read_packet(AVFormatContext *s, AVPacket *pkt) pkt->flags |= AV_PKT_FLAG_DISCARD; } if (sc->ctts_data && sc->ctts_index < sc->ctts_count) { -pkt->pts = pkt->dts + sc->dts_shift + sc->ctts_data[sc->ctts_index].duration; +pkt->pts = av_sat_add64(pkt->dts, av_sat_add64(sc->dts_shift, sc->ctts_data[sc->ctts_index].duration)); /* update ctts context */ sc->ctts_sample++; if (sc->ctts_index < sc->ctts_count && ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".