[FFmpeg-cvslog] avfilter/vf_zscale: fix query_formats
ffmpeg | branch: master | Niklas Haas | Fri Jan 12 14:12:44 2024 +0100| [bfa1b7577dd646e84acafd0c82a8c2c6fe9c2a0a] | committer: Niklas Haas avfilter/vf_zscale: fix query_formats Wrong field assignment as a result of copy/paste error. > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bfa1b7577dd646e84acafd0c82a8c2c6fe9c2a0a --- libavfilter/vf_zscale.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c index 3b14ce4f33..1c55282842 100644 --- a/libavfilter/vf_zscale.c +++ b/libavfilter/vf_zscale.c @@ -225,20 +225,20 @@ static int query_formats(AVFilterContext *ctx) if (ret < 0) return ret; -if ((ret = ff_formats_ref(ff_all_color_spaces(), &ctx->inputs[0]->outcfg.formats)) < 0 || -(ret = ff_formats_ref(ff_all_color_ranges(), &ctx->inputs[0]->outcfg.formats)) < 0) +if ((ret = ff_formats_ref(ff_all_color_spaces(), &ctx->inputs[0]->outcfg.color_spaces)) < 0 || +(ret = ff_formats_ref(ff_all_color_ranges(), &ctx->inputs[0]->outcfg.color_ranges)) < 0) return ret; formats = s->colorspace != ZIMG_MATRIX_UNSPECIFIED && s->colorspace > 0 ? ff_make_formats_list_singleton(s->colorspace) : ff_all_color_spaces(); -if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) +if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.color_spaces)) < 0) return ret; formats = s->range != -1 ? ff_make_formats_list_singleton(convert_range_from_zimg(s->range)) : ff_all_color_ranges(); -if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0) +if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.color_ranges)) < 0) return ret; return 0; ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avcodec/hevc: Add add_residual_4/8/16/32 asm opt
ffmpeg | branch: master | jinbo | Thu Dec 28 16:21:00 2023 +0800| [cfbdda607d02f9e23ead8252243643e167d38414] | committer: Michael Niedermayer avcodec/hevc: Add add_residual_4/8/16/32 asm opt After this patch, the performance of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads improves 2fps (45fps-->47fps). Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cfbdda607d02f9e23ead8252243643e167d38414 --- libavcodec/loongarch/Makefile | 3 +- libavcodec/loongarch/hevc_add_res.S | 162 ++ libavcodec/loongarch/hevcdsp_init_loongarch.c | 5 + libavcodec/loongarch/hevcdsp_lsx.h| 5 + 4 files changed, 174 insertions(+), 1 deletion(-) diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index 06cfab5c20..07ea97f803 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -27,7 +27,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ loongarch/hevc_lpf_sao_lsx.o \ loongarch/hevc_mc_bi_lsx.o \ loongarch/hevc_mc_uni_lsx.o \ - loongarch/hevc_mc_uniw_lsx.o + loongarch/hevc_mc_uniw_lsx.o \ + loongarch/hevc_add_res.o LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \ loongarch/h264idct_loongarch.o \ loongarch/h264dsp.o diff --git a/libavcodec/loongarch/hevc_add_res.S b/libavcodec/loongarch/hevc_add_res.S new file mode 100644 index 00..dd2d820af8 --- /dev/null +++ b/libavcodec/loongarch/hevc_add_res.S @@ -0,0 +1,162 @@ +/* + * Loongson LSX optimized add_residual functions for HEVC decoding + * + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by jinbo + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "loongson_asm.S" + +/* + * void ff_hevc_add_residual4x4_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +.macro ADD_RES_LSX_4x4_8 +vldrepl.w vr0,a0, 0 +add.d t0, a0, a2 +vldrepl.w vr1,t0, 0 +vldvr2,a1, 0 + +vilvl.wvr1,vr1,vr0 +vsllwil.hu.bu vr1,vr1,0 +vadd.h vr1,vr1,vr2 +vssrani.bu.h vr1,vr1,0 + +vstelm.w vr1,a0, 0,0 +vstelm.w vr1,t0, 0,1 +.endm + +function ff_hevc_add_residual4x4_8_lsx +ADD_RES_LSX_4x4_8 +alsl.d a0, a2, a0, 1 +addi.d a1, a1, 16 +ADD_RES_LSX_4x4_8 +endfunc + +/* + * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, ptrdiff_t stride) + */ +.macro ADD_RES_LSX_8x8_8 +vldrepl.d vr0,a0, 0 +add.d t0, a0, a2 +vldrepl.d vr1,t0, 0 +add.d t1, t0, a2 +vldrepl.d vr2,t1, 0 +add.d t2, t1, a2 +vldrepl.d vr3,t2, 0 + +vldvr4,a1, 0 +addi.d t3, zero, 16 +vldx vr5,a1, t3 +addi.d t4, a1, 32 +vldvr6,t4, 0 +vldx vr7,t4, t3 + +vsllwil.hu.bu vr0,vr0,0 +vsllwil.hu.bu vr1,vr1,0 +vsllwil.hu.bu vr2,vr2,0 +vsllwil.hu.bu vr3,vr3,0 +vadd.h vr0,vr0,vr4 +vadd.h vr1,vr1,vr5 +vadd.h vr2,vr2,vr6 +vadd.h vr3,vr3,vr7 +vssrani.bu.h vr1,vr0,0 +vssrani.bu.h vr3,vr2,0 + +vstelm.d vr1,a0, 0, 0 +vstelm.d vr1,t0, 0, 1 +vstelm.d vr3,t1, 0, 0 +vstelm.d vr3,t2, 0, 1 +.endm + +function ff_hevc_add_residual8x8_8_lsx +ADD_RES_LSX_8x8_8 +alsl.d a0, a2, a0,2 +addi.d a1, a1, 64 +ADD_RES_LSX_8x8_8 +endfunc + +/* + *
[FFmpeg-cvslog] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt
ffmpeg | branch: master | jinbo | Thu Dec 28 16:21:01 2023 +0800| [a28eea2a277bb58004dc7ecccd543fa4baf69170] | committer: Michael Niedermayer avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt tests/checkasm/checkasm: C LSX LASX put_hevc_pel_uni_w_pixels4_8_c: 2.7 1.0 put_hevc_pel_uni_w_pixels6_8_c: 6.2 2.0 1.5 put_hevc_pel_uni_w_pixels8_8_c: 10.7 2.5 1.7 put_hevc_pel_uni_w_pixels12_8_c: 23.0 5.5 5.0 put_hevc_pel_uni_w_pixels16_8_c: 41.0 8.2 5.0 put_hevc_pel_uni_w_pixels24_8_c: 91.0 19.7 13.2 put_hevc_pel_uni_w_pixels32_8_c: 161.7 32.5 16.2 put_hevc_pel_uni_w_pixels48_8_c: 354.5 73.7 43.0 put_hevc_pel_uni_w_pixels64_8_c: 641.5 130.0 64.2 Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads is 1fps(47fps-->48fps). Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a28eea2a277bb58004dc7ecccd543fa4baf69170 --- libavcodec/loongarch/Makefile | 3 +- libavcodec/loongarch/hevc_mc.S| 471 ++ libavcodec/loongarch/hevcdsp_init_loongarch.c | 43 +++ libavcodec/loongarch/hevcdsp_lasx.h | 53 +++ libavcodec/loongarch/hevcdsp_lsx.h| 27 ++ 5 files changed, 596 insertions(+), 1 deletion(-) diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index 07ea97f803..ad98cd4054 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ loongarch/hevc_mc_bi_lsx.o \ loongarch/hevc_mc_uni_lsx.o \ loongarch/hevc_mc_uniw_lsx.o \ - loongarch/hevc_add_res.o + loongarch/hevc_add_res.o \ + loongarch/hevc_mc.o LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \ loongarch/h264idct_loongarch.o \ loongarch/h264dsp.o diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S new file mode 100644 index 00..c5d553effe --- /dev/null +++ b/libavcodec/loongarch/hevc_mc.S @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by 
jinbo + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "loongson_asm.S" + +.macro LOAD_VAR bit +addi.w t1, a5, 6 //shift +addi.w t3, zero,1 //one +sub.w t4, t1, t3 +sll.w t3, t3, t4 //offset +.if \bit == 128 +vreplgr2vr.w vr1,a6 //wx +vreplgr2vr.w vr2,t3 //offset +vreplgr2vr.w vr3,t1 //shift +vreplgr2vr.w vr4,a7 //ox +.else +xvreplgr2vr.w xr1,a6 +xvreplgr2vr.w xr2,t3 +xvreplgr2vr.w xr3,t1 +xvreplgr2vr.w xr4,a7 +.endif +.endm + +.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w +vldrepl.d vr0,\src0, 0 +vsllwil.hu.bu vr0,vr0, 0 +vexth.wu.huvr5,vr0 +vsllwil.wu.hu vr0,vr0, 0 +vslli.wvr0,vr0, 6 +vslli.wvr5,vr5, 6 +vmul.w vr0,vr0, vr1 +vmul.w vr5,vr5, vr1 +vadd.w vr0,vr0, vr2 +vadd.w vr5,vr5, vr2 +vsra.w vr0,vr0, vr3 +vsra.w vr5,vr5, vr3 +vadd.w vr0,vr0, vr4 +vadd.w vr5,vr5, vr4 +vssrani.h.wvr5,vr0, 0 +vssrani.bu.h vr5,vr5, 0 +.if \w == 6 +fst.s f5, \dst0, 0 +vstelm.h vr5,\dst0, 4, 2 +.else +fst.d f5, \dst0, 0 +.endif +.endm + +.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w +vldrepl.d vr0,\src0, 0 +add.d t2, \src0, a3 +vldrepl.d vr5,t2, 0 +xvpermi.q xr0,xr5, 0x02 +xvsllwi
[FFmpeg-cvslog] avfilter/vsrc_testsrc: fix colorchart black stripe
ffmpeg | branch: master | Vladimir Petrov | Tue Jan 9 18:42:19 2024 +0200| [c915dc4c5059730a5263ac8d4c99e47d13db87da] | committer: Michael Niedermayer avfilter/vsrc_testsrc: fix colorchart black stripe Fixed blackstripe on bottom or segmentation fault in case when patch width and height differ. Signed-off-by: Vladimir Petrov Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c915dc4c5059730a5263ac8d4c99e47d13db87da --- libavfilter/vsrc_testsrc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavfilter/vsrc_testsrc.c b/libavfilter/vsrc_testsrc.c index da17e950d8..3b5536badc 100644 --- a/libavfilter/vsrc_testsrc.c +++ b/libavfilter/vsrc_testsrc.c @@ -1977,7 +1977,7 @@ static void colorchart_fill_picture(AVFilterContext *ctx, AVFrame *frame) const int w = colorchart_presets[preset].w; const int h = colorchart_presets[preset].h; const int pw = test->pw; -const int ph = test->pw; +const int ph = test->ph; for (int y = 0; y < h; y++) { for (int x = 0; x < w; x++) { ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avcodec/hevc: Add asm opt for the following functions
ffmpeg | branch: master | jinbo | Thu Dec 28 16:21:04 2023 +0800| [9239081db3d355561ce5d0454db08af33c1e0356] | committer: Michael Niedermayer avcodec/hevc: Add asm opt for the following functions tests/checkasm/checkasm: C LSX LASX put_hevc_qpel_uni_h4_8_c: 5.7 1.2 put_hevc_qpel_uni_h6_8_c: 12.2 2.7 put_hevc_qpel_uni_h8_8_c: 21.5 3.2 put_hevc_qpel_uni_h12_8_c: 47.2 9.2 7.2 put_hevc_qpel_uni_h16_8_c: 87.0 11.7 9.0 put_hevc_qpel_uni_h24_8_c: 188.2 27.5 21.0 put_hevc_qpel_uni_h32_8_c: 335.2 46.7 28.5 put_hevc_qpel_uni_h48_8_c: 772.5 104.5 65.2 put_hevc_qpel_uni_h64_8_c: 1383.2 142.2 109.0 put_hevc_epel_uni_w_v4_8_c: 5.0 1.5 put_hevc_epel_uni_w_v6_8_c: 10.7 3.5 2.5 put_hevc_epel_uni_w_v8_8_c: 18.2 3.7 3.0 put_hevc_epel_uni_w_v12_8_c: 40.2 10.7 7.5 put_hevc_epel_uni_w_v16_8_c: 70.2 13.0 9.2 put_hevc_epel_uni_w_v24_8_c: 158.2 30.2 22.5 put_hevc_epel_uni_w_v32_8_c: 281.0 52.0 36.5 put_hevc_epel_uni_w_v48_8_c: 631.7 116.7 82.7 put_hevc_epel_uni_w_v64_8_c: 1108.2 207.5 142.2 put_hevc_epel_uni_w_h4_8_c: 4.7 1.2 put_hevc_epel_uni_w_h6_8_c: 9.7 3.5 2.7 put_hevc_epel_uni_w_h8_8_c: 17.2 4.2 3.5 put_hevc_epel_uni_w_h12_8_c: 38.0 11.5 7.2 put_hevc_epel_uni_w_h16_8_c: 69.2 14.5 9.2 put_hevc_epel_uni_w_h24_8_c: 152.0 34.7 22.5 put_hevc_epel_uni_w_h32_8_c: 271.0 58.0 40.0 put_hevc_epel_uni_w_h48_8_c: 597.5 136.7 95.0 put_hevc_epel_uni_w_h64_8_c: 1074.0 252.2 168.0 put_hevc_epel_bi_h4_8_c: 4.5 0.7 put_hevc_epel_bi_h6_8_c: 9.0 1.5 put_hevc_epel_bi_h8_8_c: 15.2 1.7 put_hevc_epel_bi_h12_8_c: 33.5 4.2 3.7 put_hevc_epel_bi_h16_8_c: 59.7 5.2 4.7 put_hevc_epel_bi_h24_8_c: 132.2 11.0 put_hevc_epel_bi_h32_8_c: 232.7 20.2 13.2 put_hevc_epel_bi_h48_8_c: 521.7 45.2 31.2 put_hevc_epel_bi_h64_8_c: 949.0 71.5 51.0 After this patch, the performance of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads improves 1fps(55fps-->56fps). 
Change-Id: I8cc1e41daa63ca478039bc55d1ee8934a7423f51 Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9239081db3d355561ce5d0454db08af33c1e0356 --- libavcodec/loongarch/hevc_mc.S| 1991 - libavcodec/loongarch/hevcdsp_init_loongarch.c | 66 + libavcodec/loongarch/hevcdsp_lasx.h | 54 + libavcodec/loongarch/hevcdsp_lsx.h| 36 +- 4 files changed, 2144 insertions(+), 3 deletions(-) diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S index 0b0647546b..a0e5938fbd 100644 --- a/libavcodec/loongarch/hevc_mc.S +++ b/libavcodec/loongarch/hevc_mc.S @@ -1784,8 +1784,12 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx endfunc const shufb -.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6 -.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 +.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6 //mask for epel_uni_w(128-bit) +.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 //mask for epel_uni_w(256-bit) +.byte 0,1,2,3, 4,5,6,7 ,1,2,3,4, 5,6,7,8 //mask for qpel_uni_h4 +.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //mask for qpel_uni_h/v6/8... 
+.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 7,8,9,10 //epel_uni_w_h16/24/32/48/64 +.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8, 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8 //mask for bi_epel_h16/24/32/48/64 endconst .macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w @@ -2584,3 +2588,1986 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx addi.d t5, t5, -1 bnez t5, .LOOP_HV64_LASX endfunc + +/* + * void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, + *const uint8_t *_src, ptrdiff_t _srcstride, + *int height, intptr_t mx, intptr_t my, + *int width) + */ +function ff_hevc_put_hevc_uni_qpel_h4_8_lsx +addi.d t0, a5, -1 +slli.w t0, t0, 4 +la.local t1, ff_hevc_qpel_filters +vldx vr5,t1, t0 //filter +addi.d a2, a2, -3 //src -= 3 +addi.w t1, zero,32 +vreplgr2vr.h vr1,t1 +la.local t1, shufb +vldvr2,t1, 32 //mask0 0 1 +vaddi.bu vr3,vr2, 2 //mask1 2 3 +.LOOP_UNI_H4: +vldvr18, a2, 0 +vldx vr19, a2, a3 +alsl.d a2, a3, a2, 1 +vshuf.bvr6,vr18,vr18, vr2 +vshuf.bvr7,vr18,vr18, vr3 +vshuf.bvr8,
[FFmpeg-cvslog] avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 asm opt
ffmpeg | branch: master | jinbo | Thu Dec 28 16:21:02 2023 +0800| [6c6bf18ce8716c605fd7a326fd04c3d4ccac6259] | committer: Michael Niedermayer avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 asm opt tests/checkasm/checkasm: C LSX LASX put_hevc_qpel_uni_w_h4_8_c: 6.5 1.7 1.2 put_hevc_qpel_uni_w_h6_8_c: 14.5 4.5 3.7 put_hevc_qpel_uni_w_h8_8_c: 24.5 5.7 4.5 put_hevc_qpel_uni_w_h12_8_c: 54.7 17.5 12.0 put_hevc_qpel_uni_w_h16_8_c: 96.5 22.7 13.2 put_hevc_qpel_uni_w_h24_8_c: 216.0 51.2 33.2 put_hevc_qpel_uni_w_h32_8_c: 385.7 87.0 53.2 put_hevc_qpel_uni_w_h48_8_c: 860.5 192.0 113.2 put_hevc_qpel_uni_w_h64_8_c: 1531.0 334.2 200.0 put_hevc_qpel_uni_w_v4_8_c: 8.0 1.7 put_hevc_qpel_uni_w_v6_8_c: 17.2 4.5 put_hevc_qpel_uni_w_v8_8_c: 29.5 6.0 5.2 put_hevc_qpel_uni_w_v12_8_c: 65.2 16.0 11.7 put_hevc_qpel_uni_w_v16_8_c: 116.5 20.5 14.0 put_hevc_qpel_uni_w_v24_8_c: 259.2 48.5 37.2 put_hevc_qpel_uni_w_v32_8_c: 459.5 80.5 56.0 put_hevc_qpel_uni_w_v48_8_c: 1028.5 180.2 126.5 put_hevc_qpel_uni_w_v64_8_c: 1831.2 319.2 224.2 Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads is 4fps(48fps-->52fps). 
Change-Id: I1178848541d90083869225ba98a02e6aa8bb8c5a Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6c6bf18ce8716c605fd7a326fd04c3d4ccac6259 --- libavcodec/loongarch/hevc_mc.S| 1294 + libavcodec/loongarch/hevcdsp_init_loongarch.c | 38 + libavcodec/loongarch/hevcdsp_lasx.h | 18 + libavcodec/loongarch/hevcdsp_lsx.h| 20 + 4 files changed, 1370 insertions(+) diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S index c5d553effe..2ee338fb8e 100644 --- a/libavcodec/loongarch/hevc_mc.S +++ b/libavcodec/loongarch/hevc_mc.S @@ -21,6 +21,8 @@ #include "loongson_asm.S" +.extern ff_hevc_qpel_filters + .macro LOAD_VAR bit addi.w t1, a5, 6 //shift addi.w t3, zero,1 //one @@ -469,3 +471,1295 @@ function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx addi.w a4, a4, -1 bnez a4, .LOOP_PIXELS64_LASX endfunc + +.macro vhaddw.d.h in0 +vhaddw.w.h \in0, \in0, \in0 +vhaddw.d.w \in0, \in0, \in0 +.endm + +.macro xvhaddw.d.h in0 +xvhaddw.w.h \in0, \in0, \in0 +xvhaddw.d.w \in0, \in0, \in0 +.endm + +function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx +LOAD_VAR 128 +ld.d t0, sp, 8 //my +addi.d t0, t0, -1 +slli.w t0, t0, 4 +la.local t1, ff_hevc_qpel_filters +vldx vr5,t1, t0 //filter +slli.d t0, a3, 1 //stride * 2 +add.d t1, t0, a3 //stride * 3 +add.d t2, t1, a3 //stride * 4 +sub.d a2, a2, t1 //src -= stride*3 +fld.s f6, a2, 0 //0 +fldx.s f7, a2, a3 //1 +fldx.s f8, a2, t0 //2 +add.d a2, a2, t1 +fld.s f9, a2, 0 //3 +fldx.s f10,a2, a3 //4 +fldx.s f11,a2, t0 //5 +fldx.s f12,a2, t1 //6 +add.d a2, a2, t2 +vilvl.bvr6,vr7, vr6 +vilvl.bvr7,vr9, vr8 +vilvl.bvr8,vr11,vr10 +vilvl.bvr9,vr13,vr12 +vilvl.hvr6,vr7, vr6 +vilvl.hvr7,vr9, vr8 +vilvl.wvr8,vr7, vr6 +vilvh.wvr9,vr7, vr6 +.LOOP_V4: +fld.s f13,a2, 0 //7 +fldx.s f14,a2, a3 //8 next loop +add.d a2, a2, t0 +vextrins.b vr8,vr13,0x70 +vextrins.b vr8,vr13,0xf1 +vextrins.b vr9,vr13,0x72 +vextrins.b vr9,vr13,0xf3 +vbsrl.vvr10, vr8, 1 +vbsrl.vvr11, vr9, 1 
+vextrins.b vr10, vr14,0x70 +vextrins.b vr10, vr14,0xf1 +vextrins.b vr11, vr14,0x72 +vextrins.b vr11, vr14,0xf3 +vdp2.h.bu.bvr6,vr8, vr5 //QPEL_FILTER(src, stride) +vdp2.h.bu.bvr7,vr9, vr5 +vdp2.h.bu.bvr12, vr10,vr5 +vdp2.h.bu.bvr13, vr11,vr5 +vbsrl.vvr8,vr10,1 +vbsrl.vvr9,vr11,1 +vhaddw.d.h vr6 +vhaddw.d.h vr7 +vhaddw.d.h vr12 +vhaddw.d.h vr13 +vpickev.w vr6,vr7, vr6 +vpickev.w vr12, vr13,vr12 +vmulwev.w.hvr6,vr6, vr1 //QPEL_FILTER(src, stride) * wx +vmulwev.w.hvr12, vr12,vr1 +va
[FFmpeg-cvslog] avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt
ffmpeg | branch: master | jinbo | Thu Dec 28 16:21:03 2023 +0800| [1f642b99afa073664421e9df24360c35e3ee7a73] | committer: Michael Niedermayer avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt tests/checkasm/checkasm: C LSX LASX put_hevc_epel_uni_w_hv4_8_c: 9.5 2.2 put_hevc_epel_uni_w_hv6_8_c: 18.5 5.0 3.7 put_hevc_epel_uni_w_hv8_8_c: 30.7 6.0 4.5 put_hevc_epel_uni_w_hv12_8_c: 63.7 14.0 10.7 put_hevc_epel_uni_w_hv16_8_c: 107.5 22.7 17.0 put_hevc_epel_uni_w_hv24_8_c: 236.7 50.2 31.7 put_hevc_epel_uni_w_hv32_8_c: 414.5 88.0 53.0 put_hevc_epel_uni_w_hv48_8_c: 917.5 197.7 118.5 put_hevc_epel_uni_w_hv64_8_c: 1617.0 349.5 203.0 After this patch, the performance of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads improves 3fps (52fps-->55fps). Change-Id: If067e394cec4685c62193e7adb829ac93ba4804d Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1f642b99afa073664421e9df24360c35e3ee7a73 --- libavcodec/loongarch/hevc_mc.S| 821 ++ libavcodec/loongarch/hevcdsp_init_loongarch.c | 19 + libavcodec/loongarch/hevcdsp_lasx.h | 9 + libavcodec/loongarch/hevcdsp_lsx.h| 10 + 4 files changed, 859 insertions(+) diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S index 2ee338fb8e..0b0647546b 100644 --- a/libavcodec/loongarch/hevc_mc.S +++ b/libavcodec/loongarch/hevc_mc.S @@ -22,6 +22,7 @@ #include "loongson_asm.S" .extern ff_hevc_qpel_filters +.extern ff_hevc_epel_filters .macro LOAD_VAR bit addi.w t1, a5, 6 //shift @@ -206,6 +207,12 @@ .endif .endm +/* + * void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, + * const uint8_t *_src, ptrdiff_t _srcstride, + * int height, int denom, int wx, int ox, + * intptr_t mx, intptr_t my, int width) + */ function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx LOAD_VAR 128 srli.w t0, a4, 1 @@ -482,6 +489,12 @@ endfunc xvhaddw.d.w \in0, \in0, \in0 .endm +/* + * void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t 
_dststride, + * const uint8_t *_src, ptrdiff_t _srcstride, + * int height, int denom, int wx, int ox, + * intptr_t mx, intptr_t my, int width) + */ function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx LOAD_VAR 128 ld.d t0, sp, 8 //my @@ -1253,6 +1266,12 @@ endfunc xvssrani.bu.h \out0, xr11,0 .endm +/* + * void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, + * const uint8_t *_src, ptrdiff_t _srcstride, + * int height, int denom, int wx, int ox, + * intptr_t mx, intptr_t my, int width) + */ function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx LOAD_VAR 128 ld.d t0, sp, 0 //mx @@ -1763,3 +1782,805 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx addi.d a4, a4, -1 bnez a4, .LOOP_H64_LASX endfunc + +const shufb +.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6 +.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 +endconst + +.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w +fld.d f7, a2, 0 // start to load src +fldx.d f8, a2, a3 +alsl.d a2, a3, a2,1 +fld.d f9, a2, 0 +vshuf.bvr7,vr7, vr7, vr0 // 0123 1234 2345 3456 +vshuf.bvr8,vr8, vr8, vr0 +vshuf.bvr9,vr9, vr9, vr0 +vdp2.h.bu.bvr10, vr7, vr5 // EPEL_FILTER(src, 1) +vdp2.h.bu.bvr11, vr8, vr5 +vdp2.h.bu.bvr12, vr9, vr5 +vhaddw.w.h vr10, vr10,vr10 // tmp[0/1/2/3] +vhaddw.w.h vr11, vr11,vr11 // vr10,vr11,vr12 corresponding to EPEL_EXTRA +vhaddw.w.h vr12, vr12,vr12 +.LOOP_HV4_\w: +add.d a2, a2, a3 +fld.d f14,a2, 0// height loop begin +vshuf.bvr14, vr14,vr14, vr0 +vdp2.h.bu.bvr13, vr14,vr5 +vhaddw.w.h vr13, vr13,vr13 +vmul.w vr14, vr10,vr16 // EPEL_FILTER(tmp, MAX_PB_SIZE) +vmadd.wvr14, vr11,vr17 +vmadd.wvr14, vr12,vr18 +vmadd.wvr14, vr13,vr19 +vaddi.wu vr10, vr11,0//back up previous value +vaddi.wu vr11, vr12,0 +vaddi.wu vr12, vr13,0 +vsrai.wvr14, vr14,6// >> 6 +vmul.w vr14, vr14,vr1 // * wx +vadd.w vr14, vr14,vr2 // + of
[FFmpeg-cvslog] avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt
ffmpeg | branch: master | yuanhecai | Thu Dec 28 16:21:05 2023 +0800| [a87a52ed0b561dc231e707ee94299561631085ee] | committer: Michael Niedermayer avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt tests/checkasm/checkasm: C LSX LASX hevc_idct_32x32_8_c: 1243.0 211.7 101.7 Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with 8 threads is 1fps(56fps-->57fps). Reviewed-by: yinshiyou...@loongson.cn Signed-off-by: Michael Niedermayer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a87a52ed0b561dc231e707ee94299561631085ee --- libavcodec/loongarch/Makefile | 3 +- libavcodec/loongarch/hevc_idct.S | 857 ++ libavcodec/loongarch/hevc_idct_lsx.c | 10 +- libavcodec/loongarch/hevcdsp_init_loongarch.c | 2 + libavcodec/loongarch/hevcdsp_lasx.h | 2 + 5 files changed, 868 insertions(+), 6 deletions(-) diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile index ad98cd4054..07da2964e4 100644 --- a/libavcodec/loongarch/Makefile +++ b/libavcodec/loongarch/Makefile @@ -29,7 +29,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER) += loongarch/hevcdsp_lsx.o \ loongarch/hevc_mc_uni_lsx.o \ loongarch/hevc_mc_uniw_lsx.o \ loongarch/hevc_add_res.o \ - loongarch/hevc_mc.o + loongarch/hevc_mc.o \ + loongarch/hevc_idct.o LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \ loongarch/h264idct_loongarch.o \ loongarch/h264dsp.o diff --git a/libavcodec/loongarch/hevc_idct.S b/libavcodec/loongarch/hevc_idct.S new file mode 100644 index 00..83c46e17d7 --- /dev/null +++ b/libavcodec/loongarch/hevc_idct.S @@ -0,0 +1,857 @@ +/* + * Copyright (c) 2023 Loongson Technology Corporation Limited + * Contributed by Hecai Yuan + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "loongson_asm.S" + +.macro fr_store +addi.dsp, sp, -64 +fst.d f24, sp, 0 +fst.d f25, sp, 8 +fst.d f26, sp, 16 +fst.d f27, sp, 24 +fst.d f28, sp, 32 +fst.d f29, sp, 40 +fst.d f30, sp, 48 +fst.d f31, sp, 56 +.endm + +.macro fr_recover +fld.d f24, sp, 0 +fld.d f25, sp, 8 +fld.d f26, sp, 16 +fld.d f27, sp, 24 +fld.d f28, sp, 32 +fld.d f29, sp, 40 +fld.d f30, sp, 48 +fld.d f31, sp, 56 +addi.dsp, sp, 64 +.endm + +.extern gt32x32_cnst1 + +.extern gt32x32_cnst2 + +.extern gt8x8_cnst + +.extern gt32x32_cnst0 + +.macro idct_16x32_step1_lasx +xvldrepl.wxr20, t1, 0 +xvldrepl.wxr21, t1, 4 +xvldrepl.wxr22, t1, 8 +xvldrepl.wxr23, t1, 12 + +xvmulwev.w.h xr16, xr8, xr20 +xvmaddwod.w.h xr16, xr8, xr20 +xvmulwev.w.h xr17, xr9, xr20 +xvmaddwod.w.h xr17, xr9, xr20 + +xvmaddwev.w.h xr16, xr10, xr21 +xvmaddwod.w.h xr16, xr10, xr21 +xvmaddwev.w.h xr17, xr11, xr21 +xvmaddwod.w.h xr17, xr11, xr21 + +xvmaddwev.w.h xr16, xr12, xr22 +xvmaddwod.w.h xr16, xr12, xr22 +xvmaddwev.w.h xr17, xr13, xr22 +xvmaddwod.w.h xr17, xr13, xr22 + +xvmaddwev.w.h xr16, xr14, xr23 +xvmaddwod.w.h xr16, xr14, xr23 +xvmaddwev.w.h xr17, xr15, xr23 +xvmaddwod.w.h xr17, xr15, xr23 + +xvld xr0, t2, 0 +xvld xr1, t2, 32 + +xvadd.w xr18, xr0, xr16 +xvadd.w xr19, xr1, xr17 +xvsub.w xr0, xr0, xr16 +xvsub.w xr1,