[FFmpeg-cvslog] avfilter/vf_zscale: fix query_formats

2024-01-12 Thread Niklas Haas
ffmpeg | branch: master | Niklas Haas  | Fri Jan 12 14:12:44 
2024 +0100| [bfa1b7577dd646e84acafd0c82a8c2c6fe9c2a0a] | committer: Niklas Haas

avfilter/vf_zscale: fix query_formats

Wrong field assignment as a result of copy/paste error.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=bfa1b7577dd646e84acafd0c82a8c2c6fe9c2a0a
---

 libavfilter/vf_zscale.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c
index 3b14ce4f33..1c55282842 100644
--- a/libavfilter/vf_zscale.c
+++ b/libavfilter/vf_zscale.c
@@ -225,20 +225,20 @@ static int query_formats(AVFilterContext *ctx)
 if (ret < 0)
 return ret;
 
-if ((ret = ff_formats_ref(ff_all_color_spaces(), 
&ctx->inputs[0]->outcfg.formats)) < 0 ||
-(ret = ff_formats_ref(ff_all_color_ranges(), 
&ctx->inputs[0]->outcfg.formats)) < 0)
+if ((ret = ff_formats_ref(ff_all_color_spaces(), 
&ctx->inputs[0]->outcfg.color_spaces)) < 0 ||
+(ret = ff_formats_ref(ff_all_color_ranges(), 
&ctx->inputs[0]->outcfg.color_ranges)) < 0)
 return ret;
 
 formats = s->colorspace != ZIMG_MATRIX_UNSPECIFIED && s->colorspace > 0
 ? ff_make_formats_list_singleton(s->colorspace)
 : ff_all_color_spaces();
-if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
+if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.color_spaces)) 
< 0)
 return ret;
 
 formats = s->range != -1
 ? ff_make_formats_list_singleton(convert_range_from_zimg(s->range))
 : ff_all_color_ranges();
-if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
+if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.color_ranges)) 
< 0)
 return ret;
 
 return 0;

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] avcodec/hevc: Add add_residual_4/8/16/32 asm opt

2024-01-12 Thread jinbo
ffmpeg | branch: master | jinbo  | Thu Dec 28 16:21:00 2023 
+0800| [cfbdda607d02f9e23ead8252243643e167d38414] | committer: Michael 
Niedermayer

avcodec/hevc: Add add_residual_4/8/16/32 asm opt

After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves by 2fps (45fps-->47fps).

Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=cfbdda607d02f9e23ead8252243643e167d38414
---

 libavcodec/loongarch/Makefile |   3 +-
 libavcodec/loongarch/hevc_add_res.S   | 162 ++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   5 +
 libavcodec/loongarch/hevcdsp_lsx.h|   5 +
 4 files changed, 174 insertions(+), 1 deletion(-)

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 06cfab5c20..07ea97f803 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -27,7 +27,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)   += 
loongarch/hevcdsp_lsx.o \
  loongarch/hevc_lpf_sao_lsx.o \
  loongarch/hevc_mc_bi_lsx.o \
  loongarch/hevc_mc_uni_lsx.o \
- loongarch/hevc_mc_uniw_lsx.o
+ loongarch/hevc_mc_uniw_lsx.o \
+ loongarch/hevc_add_res.o
 LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \
  loongarch/h264idct_loongarch.o \
  loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_add_res.S 
b/libavcodec/loongarch/hevc_add_res.S
new file mode 100644
index 00..dd2d820af8
--- /dev/null
+++ b/libavcodec/loongarch/hevc_add_res.S
@@ -0,0 +1,162 @@
+/*
+ * Loongson LSX optimized add_residual functions for HEVC decoding
+ *
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+/*
+ * void ff_hevc_add_residual4x4_8_lsx(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_4x4_8
+vldrepl.w  vr0,a0, 0
+add.d  t0, a0, a2
+vldrepl.w  vr1,t0, 0
+vldvr2,a1, 0
+
+vilvl.wvr1,vr1,vr0
+vsllwil.hu.bu  vr1,vr1,0
+vadd.h vr1,vr1,vr2
+vssrani.bu.h   vr1,vr1,0
+
+vstelm.w   vr1,a0, 0,0
+vstelm.w   vr1,t0, 0,1
+.endm
+
+function ff_hevc_add_residual4x4_8_lsx
+ADD_RES_LSX_4x4_8
+alsl.d a0, a2, a0,   1
+addi.d a1, a1, 16
+ADD_RES_LSX_4x4_8
+endfunc
+
+/*
+ * void ff_hevc_add_residual8x8_8_lsx(uint8_t *dst, const int16_t *res, 
ptrdiff_t stride)
+ */
+.macro ADD_RES_LSX_8x8_8
+vldrepl.d  vr0,a0, 0
+add.d  t0, a0, a2
+vldrepl.d  vr1,t0, 0
+add.d  t1, t0, a2
+vldrepl.d  vr2,t1, 0
+add.d  t2, t1, a2
+vldrepl.d  vr3,t2, 0
+
+vldvr4,a1, 0
+addi.d t3, zero,   16
+vldx   vr5,a1, t3
+addi.d t4, a1, 32
+vldvr6,t4, 0
+vldx   vr7,t4, t3
+
+vsllwil.hu.bu  vr0,vr0,0
+vsllwil.hu.bu  vr1,vr1,0
+vsllwil.hu.bu  vr2,vr2,0
+vsllwil.hu.bu  vr3,vr3,0
+vadd.h vr0,vr0,vr4
+vadd.h vr1,vr1,vr5
+vadd.h vr2,vr2,vr6
+vadd.h vr3,vr3,vr7
+vssrani.bu.h   vr1,vr0,0
+vssrani.bu.h   vr3,vr2,0
+
+vstelm.d   vr1,a0, 0, 0
+vstelm.d   vr1,t0, 0, 1
+vstelm.d   vr3,t1, 0, 0
+vstelm.d   vr3,t2, 0, 1
+.endm
+
+function ff_hevc_add_residual8x8_8_lsx
+ADD_RES_LSX_8x8_8
+alsl.d a0, a2, a0,2
+addi.d a1, a1, 64
+ADD_RES_LSX_8x8_8
+endfunc
+
+/*
+ * 

[FFmpeg-cvslog] avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt

2024-01-12 Thread jinbo
ffmpeg | branch: master | jinbo  | Thu Dec 28 16:21:01 2023 
+0800| [a28eea2a277bb58004dc7ecccd543fa4baf69170] | committer: Michael 
Niedermayer

avcodec/hevc: Add pel_uni_w_pixels4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_pel_uni_w_pixels4_8_c:    2.7     1.0
put_hevc_pel_uni_w_pixels6_8_c:    6.2     2.0     1.5
put_hevc_pel_uni_w_pixels8_8_c:    10.7    2.5     1.7
put_hevc_pel_uni_w_pixels12_8_c:   23.0    5.5     5.0
put_hevc_pel_uni_w_pixels16_8_c:   41.0    8.2     5.0
put_hevc_pel_uni_w_pixels24_8_c:   91.0    19.7    13.2
put_hevc_pel_uni_w_pixels32_8_c:   161.7   32.5    16.2
put_hevc_pel_uni_w_pixels48_8_c:   354.5   73.7    43.0
put_hevc_pel_uni_w_pixels64_8_c:   641.5   130.0   64.2

Speedup of decoding H265 4K 30FPS 30Mbps on 3A6000 with
8 threads is 1fps(47fps-->48fps).

Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a28eea2a277bb58004dc7ecccd543fa4baf69170
---

 libavcodec/loongarch/Makefile |   3 +-
 libavcodec/loongarch/hevc_mc.S| 471 ++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |  43 +++
 libavcodec/loongarch/hevcdsp_lasx.h   |  53 +++
 libavcodec/loongarch/hevcdsp_lsx.h|  27 ++
 5 files changed, 596 insertions(+), 1 deletion(-)

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index 07ea97f803..ad98cd4054 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -28,7 +28,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)   += 
loongarch/hevcdsp_lsx.o \
  loongarch/hevc_mc_bi_lsx.o \
  loongarch/hevc_mc_uni_lsx.o \
  loongarch/hevc_mc_uniw_lsx.o \
- loongarch/hevc_add_res.o
+ loongarch/hevc_add_res.o \
+ loongarch/hevc_mc.o
 LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \
  loongarch/h264idct_loongarch.o \
  loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
new file mode 100644
index 00..c5d553effe
--- /dev/null
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by jinbo 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro LOAD_VAR bit
+addi.w t1, a5,  6  //shift
+addi.w t3, zero,1  //one
+sub.w  t4, t1,  t3
+sll.w  t3, t3,  t4 //offset
+.if \bit == 128
+vreplgr2vr.w   vr1,a6  //wx
+vreplgr2vr.w   vr2,t3  //offset
+vreplgr2vr.w   vr3,t1  //shift
+vreplgr2vr.w   vr4,a7  //ox
+.else
+xvreplgr2vr.w  xr1,a6
+xvreplgr2vr.w  xr2,t3
+xvreplgr2vr.w  xr3,t1
+xvreplgr2vr.w  xr4,a7
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8_LSX src0, dst0, w
+vldrepl.d  vr0,\src0,   0
+vsllwil.hu.bu  vr0,vr0, 0
+vexth.wu.huvr5,vr0
+vsllwil.wu.hu  vr0,vr0, 0
+vslli.wvr0,vr0, 6
+vslli.wvr5,vr5, 6
+vmul.w vr0,vr0, vr1
+vmul.w vr5,vr5, vr1
+vadd.w vr0,vr0, vr2
+vadd.w vr5,vr5, vr2
+vsra.w vr0,vr0, vr3
+vsra.w vr5,vr5, vr3
+vadd.w vr0,vr0, vr4
+vadd.w vr5,vr5, vr4
+vssrani.h.wvr5,vr0, 0
+vssrani.bu.h   vr5,vr5, 0
+.if \w == 6
+fst.s  f5, \dst0,   0
+vstelm.h   vr5,\dst0,   4, 2
+.else
+fst.d  f5, \dst0,   0
+.endif
+.endm
+
+.macro HEVC_PEL_UNI_W_PIXELS8x2_LASX src0, dst0, w
+vldrepl.d  vr0,\src0,   0
+add.d  t2, \src0,   a3
+vldrepl.d  vr5,t2,  0
+xvpermi.q  xr0,xr5, 0x02
+xvsllwi

[FFmpeg-cvslog] avfilter/vsrc_testsrc: fix colorchart black stripe

2024-01-12 Thread Vladimir Petrov
ffmpeg | branch: master | Vladimir Petrov  | Tue Jan  9 
18:42:19 2024 +0200| [c915dc4c5059730a5263ac8d4c99e47d13db87da] | committer: 
Michael Niedermayer

avfilter/vsrc_testsrc: fix colorchart black stripe

Fixed a black stripe at the bottom of the picture, or a segmentation
fault, when the patch width and height differ.

Signed-off-by: Vladimir Petrov 
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=c915dc4c5059730a5263ac8d4c99e47d13db87da
---

 libavfilter/vsrc_testsrc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libavfilter/vsrc_testsrc.c b/libavfilter/vsrc_testsrc.c
index da17e950d8..3b5536badc 100644
--- a/libavfilter/vsrc_testsrc.c
+++ b/libavfilter/vsrc_testsrc.c
@@ -1977,7 +1977,7 @@ static void colorchart_fill_picture(AVFilterContext *ctx, 
AVFrame *frame)
 const int w = colorchart_presets[preset].w;
 const int h = colorchart_presets[preset].h;
 const int pw = test->pw;
-const int ph = test->pw;
+const int ph = test->ph;
 
 for (int y = 0; y < h; y++) {
 for (int x = 0; x < w; x++) {

___
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

To unsubscribe, visit link above, or email
ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-cvslog] avcodec/hevc: Add asm opt for the following functions

2024-01-12 Thread jinbo
ffmpeg | branch: master | jinbo  | Thu Dec 28 16:21:04 2023 
+0800| [9239081db3d355561ce5d0454db08af33c1e0356] | committer: Michael 
Niedermayer

avcodec/hevc: Add asm opt for the following functions

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_qpel_uni_h4_8_c:          5.7     1.2
put_hevc_qpel_uni_h6_8_c:          12.2    2.7
put_hevc_qpel_uni_h8_8_c:          21.5    3.2
put_hevc_qpel_uni_h12_8_c:         47.2    9.2     7.2
put_hevc_qpel_uni_h16_8_c:         87.0    11.7    9.0
put_hevc_qpel_uni_h24_8_c:         188.2   27.5    21.0
put_hevc_qpel_uni_h32_8_c:         335.2   46.7    28.5
put_hevc_qpel_uni_h48_8_c:         772.5   104.5   65.2
put_hevc_qpel_uni_h64_8_c:         1383.2  142.2   109.0

put_hevc_epel_uni_w_v4_8_c:        5.0     1.5
put_hevc_epel_uni_w_v6_8_c:        10.7    3.5     2.5
put_hevc_epel_uni_w_v8_8_c:        18.2    3.7     3.0
put_hevc_epel_uni_w_v12_8_c:       40.2    10.7    7.5
put_hevc_epel_uni_w_v16_8_c:       70.2    13.0    9.2
put_hevc_epel_uni_w_v24_8_c:       158.2   30.2    22.5
put_hevc_epel_uni_w_v32_8_c:       281.0   52.0    36.5
put_hevc_epel_uni_w_v48_8_c:       631.7   116.7   82.7
put_hevc_epel_uni_w_v64_8_c:       1108.2  207.5   142.2

put_hevc_epel_uni_w_h4_8_c:        4.7     1.2
put_hevc_epel_uni_w_h6_8_c:        9.7     3.5     2.7
put_hevc_epel_uni_w_h8_8_c:        17.2    4.2     3.5
put_hevc_epel_uni_w_h12_8_c:       38.0    11.5    7.2
put_hevc_epel_uni_w_h16_8_c:       69.2    14.5    9.2
put_hevc_epel_uni_w_h24_8_c:       152.0   34.7    22.5
put_hevc_epel_uni_w_h32_8_c:       271.0   58.0    40.0
put_hevc_epel_uni_w_h48_8_c:       597.5   136.7   95.0
put_hevc_epel_uni_w_h64_8_c:       1074.0  252.2   168.0

put_hevc_epel_bi_h4_8_c:           4.5     0.7
put_hevc_epel_bi_h6_8_c:           9.0     1.5
put_hevc_epel_bi_h8_8_c:           15.2    1.7
put_hevc_epel_bi_h12_8_c:          33.5    4.2     3.7
put_hevc_epel_bi_h16_8_c:          59.7    5.2     4.7
put_hevc_epel_bi_h24_8_c:          132.2   11.0
put_hevc_epel_bi_h32_8_c:          232.7   20.2    13.2
put_hevc_epel_bi_h48_8_c:          521.7   45.2    31.2
put_hevc_epel_bi_h64_8_c:          949.0   71.5    51.0

After this patch, the performance of decoding H265 4K 30FPS
30Mbps on 3A6000 with 8 threads improves by 1fps (55fps-->56fps).

Change-Id: I8cc1e41daa63ca478039bc55d1ee8934a7423f51
Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=9239081db3d355561ce5d0454db08af33c1e0356
---

 libavcodec/loongarch/hevc_mc.S| 1991 -
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   66 +
 libavcodec/loongarch/hevcdsp_lasx.h   |   54 +
 libavcodec/loongarch/hevcdsp_lsx.h|   36 +-
 4 files changed, 2144 insertions(+), 3 deletions(-)

diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index 0b0647546b..a0e5938fbd 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -1784,8 +1784,12 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
 endfunc
 
 const shufb
-.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6
-.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10
+.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6  //mask for epel_uni_w(128-bit)
+.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10 //mask for epel_uni_w(256-bit)
+.byte 0,1,2,3, 4,5,6,7 ,1,2,3,4, 5,6,7,8  //mask for qpel_uni_h4
+.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8  //mask for qpel_uni_h/v6/8...
+.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6, 4,5,6,7, 5,6,7,8, 6,7,8,9, 
7,8,9,10 //epel_uni_w_h16/24/32/48/64
+.byte 0,1,1,2, 2,3,3,4 ,4,5,5,6, 6,7,7,8, 0,1,1,2, 2,3,3,4 ,4,5,5,6, 
6,7,7,8  //mask for bi_epel_h16/24/32/48/64
 endconst
 
 .macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
@@ -2584,3 +2588,1986 @@ function ff_hevc_put_hevc_epel_uni_w_hv64_8_lasx
 addi.d t5, t5,  -1
 bnez   t5, .LOOP_HV64_LASX
 endfunc
+
+/*
+ * void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
+ *const uint8_t *_src, ptrdiff_t _srcstride,
+ *int height, intptr_t mx, intptr_t my,
+ *int width)
+ */
+function ff_hevc_put_hevc_uni_qpel_h4_8_lsx
+addi.d t0, a5,  -1
+slli.w t0, t0,  4
+la.local   t1, ff_hevc_qpel_filters
+vldx   vr5,t1,  t0 //filter
+addi.d a2, a2,  -3 //src -= 3
+addi.w t1, zero,32
+vreplgr2vr.h   vr1,t1
+la.local   t1, shufb
+vldvr2,t1,  32 //mask0 0 1
+vaddi.bu   vr3,vr2, 2  //mask1 2 3
+.LOOP_UNI_H4:
+vldvr18,   a2,  0
+vldx   vr19,   a2,  a3
+alsl.d a2, a3,  a2,   1
+vshuf.bvr6,vr18,vr18,   vr2
+vshuf.bvr7,vr18,vr18,   vr3
+vshuf.bvr8,  

[FFmpeg-cvslog] avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 asm opt

2024-01-12 Thread jinbo
ffmpeg | branch: master | jinbo  | Thu Dec 28 16:21:02 2023 
+0800| [6c6bf18ce8716c605fd7a326fd04c3d4ccac6259] | committer: Michael 
Niedermayer

avcodec/hevc: Add qpel_uni_w_v|h4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_qpel_uni_w_h4_8_c:        6.5     1.7     1.2
put_hevc_qpel_uni_w_h6_8_c:        14.5    4.5     3.7
put_hevc_qpel_uni_w_h8_8_c:        24.5    5.7     4.5
put_hevc_qpel_uni_w_h12_8_c:       54.7    17.5    12.0
put_hevc_qpel_uni_w_h16_8_c:       96.5    22.7    13.2
put_hevc_qpel_uni_w_h24_8_c:       216.0   51.2    33.2
put_hevc_qpel_uni_w_h32_8_c:       385.7   87.0    53.2
put_hevc_qpel_uni_w_h48_8_c:       860.5   192.0   113.2
put_hevc_qpel_uni_w_h64_8_c:       1531.0  334.2   200.0

put_hevc_qpel_uni_w_v4_8_c:        8.0     1.7
put_hevc_qpel_uni_w_v6_8_c:        17.2    4.5
put_hevc_qpel_uni_w_v8_8_c:        29.5    6.0     5.2
put_hevc_qpel_uni_w_v12_8_c:       65.2    16.0    11.7
put_hevc_qpel_uni_w_v16_8_c:       116.5   20.5    14.0
put_hevc_qpel_uni_w_v24_8_c:       259.2   48.5    37.2
put_hevc_qpel_uni_w_v32_8_c:       459.5   80.5    56.0
put_hevc_qpel_uni_w_v48_8_c:       1028.5  180.2   126.5
put_hevc_qpel_uni_w_v64_8_c:       1831.2  319.2   224.2

Speedup of decoding H265 4K 30FPS 30Mbps on
3A6000 with 8 threads is 4fps(48fps-->52fps).

Change-Id: I1178848541d90083869225ba98a02e6aa8bb8c5a
Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=6c6bf18ce8716c605fd7a326fd04c3d4ccac6259
---

 libavcodec/loongarch/hevc_mc.S| 1294 +
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   38 +
 libavcodec/loongarch/hevcdsp_lasx.h   |   18 +
 libavcodec/loongarch/hevcdsp_lsx.h|   20 +
 4 files changed, 1370 insertions(+)

diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index c5d553effe..2ee338fb8e 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -21,6 +21,8 @@
 
 #include "loongson_asm.S"
 
+.extern ff_hevc_qpel_filters
+
 .macro LOAD_VAR bit
 addi.w t1, a5,  6  //shift
 addi.w t3, zero,1  //one
@@ -469,3 +471,1295 @@ function ff_hevc_put_hevc_pel_uni_w_pixels64_8_lasx
 addi.w a4, a4,  -1
 bnez   a4, .LOOP_PIXELS64_LASX
 endfunc
+
+.macro  vhaddw.d.h  in0
+vhaddw.w.h  \in0,  \in0,  \in0
+vhaddw.d.w  \in0,  \in0,  \in0
+.endm
+
+.macro  xvhaddw.d.h  in0
+xvhaddw.w.h  \in0,  \in0,  \in0
+xvhaddw.d.w  \in0,  \in0,  \in0
+.endm
+
+function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
+LOAD_VAR 128
+ld.d   t0, sp,  8  //my
+addi.d t0, t0,  -1
+slli.w t0, t0,  4
+la.local   t1, ff_hevc_qpel_filters
+vldx   vr5,t1,  t0  //filter
+slli.d t0, a3,  1  //stride * 2
+add.d  t1, t0,  a3 //stride * 3
+add.d  t2, t1,  a3 //stride * 4
+sub.d  a2, a2,  t1 //src -= stride*3
+fld.s  f6, a2,  0  //0
+fldx.s f7, a2,  a3 //1
+fldx.s f8, a2,  t0 //2
+add.d  a2, a2,  t1
+fld.s  f9, a2,  0  //3
+fldx.s f10,a2,  a3 //4
+fldx.s f11,a2,  t0 //5
+fldx.s f12,a2,  t1 //6
+add.d  a2, a2,  t2
+vilvl.bvr6,vr7, vr6
+vilvl.bvr7,vr9, vr8
+vilvl.bvr8,vr11,vr10
+vilvl.bvr9,vr13,vr12
+vilvl.hvr6,vr7, vr6
+vilvl.hvr7,vr9, vr8
+vilvl.wvr8,vr7, vr6
+vilvh.wvr9,vr7, vr6
+.LOOP_V4:
+fld.s  f13,a2,  0  //7
+fldx.s f14,a2,  a3 //8 next loop
+add.d  a2, a2,  t0
+vextrins.b vr8,vr13,0x70
+vextrins.b vr8,vr13,0xf1
+vextrins.b vr9,vr13,0x72
+vextrins.b vr9,vr13,0xf3
+vbsrl.vvr10,   vr8, 1
+vbsrl.vvr11,   vr9, 1
+vextrins.b vr10,   vr14,0x70
+vextrins.b vr10,   vr14,0xf1
+vextrins.b vr11,   vr14,0x72
+vextrins.b vr11,   vr14,0xf3
+vdp2.h.bu.bvr6,vr8, vr5 //QPEL_FILTER(src, stride)
+vdp2.h.bu.bvr7,vr9, vr5
+vdp2.h.bu.bvr12,   vr10,vr5
+vdp2.h.bu.bvr13,   vr11,vr5
+vbsrl.vvr8,vr10,1
+vbsrl.vvr9,vr11,1
+vhaddw.d.h vr6
+vhaddw.d.h vr7
+vhaddw.d.h vr12
+vhaddw.d.h vr13
+vpickev.w  vr6,vr7, vr6
+vpickev.w  vr12,   vr13,vr12
+vmulwev.w.hvr6,vr6, vr1 //QPEL_FILTER(src, stride) * wx
+vmulwev.w.hvr12,   vr12,vr1
+va

[FFmpeg-cvslog] avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt

2024-01-12 Thread jinbo
ffmpeg | branch: master | jinbo  | Thu Dec 28 16:21:03 2023 
+0800| [1f642b99afa073664421e9df24360c35e3ee7a73] | committer: Michael 
Niedermayer

avcodec/hevc: Add epel_uni_w_hv4/6/8/12/16/24/32/48/64 asm opt

tests/checkasm/checkasm:           C       LSX     LASX
put_hevc_epel_uni_w_hv4_8_c:       9.5     2.2
put_hevc_epel_uni_w_hv6_8_c:       18.5    5.0     3.7
put_hevc_epel_uni_w_hv8_8_c:       30.7    6.0     4.5
put_hevc_epel_uni_w_hv12_8_c:      63.7    14.0    10.7
put_hevc_epel_uni_w_hv16_8_c:      107.5   22.7    17.0
put_hevc_epel_uni_w_hv24_8_c:      236.7   50.2    31.7
put_hevc_epel_uni_w_hv32_8_c:      414.5   88.0    53.0
put_hevc_epel_uni_w_hv48_8_c:      917.5   197.7   118.5
put_hevc_epel_uni_w_hv64_8_c:      1617.0  349.5   203.0

After this patch, the performance of decoding H265 4K 30FPS 30Mbps
on 3A6000 with 8 threads improves by 3fps (52fps-->55fps).

Change-Id: If067e394cec4685c62193e7adb829ac93ba4804d
Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=1f642b99afa073664421e9df24360c35e3ee7a73
---

 libavcodec/loongarch/hevc_mc.S| 821 ++
 libavcodec/loongarch/hevcdsp_init_loongarch.c |  19 +
 libavcodec/loongarch/hevcdsp_lasx.h   |   9 +
 libavcodec/loongarch/hevcdsp_lsx.h|  10 +
 4 files changed, 859 insertions(+)

diff --git a/libavcodec/loongarch/hevc_mc.S b/libavcodec/loongarch/hevc_mc.S
index 2ee338fb8e..0b0647546b 100644
--- a/libavcodec/loongarch/hevc_mc.S
+++ b/libavcodec/loongarch/hevc_mc.S
@@ -22,6 +22,7 @@
 #include "loongson_asm.S"
 
 .extern ff_hevc_qpel_filters
+.extern ff_hevc_epel_filters
 
 .macro LOAD_VAR bit
 addi.w t1, a5,  6  //shift
@@ -206,6 +207,12 @@
 .endif
 .endm
 
+/*
+ * void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride,
+ *  const uint8_t *_src, ptrdiff_t 
_srcstride,
+ *  int height, int denom, int wx, int ox,
+ *  intptr_t mx, intptr_t my, int width)
+ */
 function ff_hevc_put_hevc_pel_uni_w_pixels4_8_lsx
 LOAD_VAR 128
 srli.w t0, a4,  1
@@ -482,6 +489,12 @@ endfunc
 xvhaddw.d.w  \in0,  \in0,  \in0
 .endm
 
+/*
+ * void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
+ *  const uint8_t *_src, ptrdiff_t _srcstride,
+ *  int height, int denom, int wx, int ox,
+ *  intptr_t mx, intptr_t my, int width)
+ */
 function ff_hevc_put_hevc_qpel_uni_w_v4_8_lsx
 LOAD_VAR 128
 ld.d   t0, sp,  8  //my
@@ -1253,6 +1266,12 @@ endfunc
 xvssrani.bu.h  \out0,  xr11,0
 .endm
 
+/*
+ * void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
+ *  const uint8_t *_src, ptrdiff_t _srcstride,
+ *  int height, int denom, int wx, int ox,
+ *  intptr_t mx, intptr_t my, int width)
+ */
 function ff_hevc_put_hevc_qpel_uni_w_h4_8_lsx
 LOAD_VAR 128
 ld.d   t0, sp,  0  //mx
@@ -1763,3 +1782,805 @@ function ff_hevc_put_hevc_qpel_uni_w_h64_8_lasx
 addi.d a4, a4,  -1
 bnez   a4, .LOOP_H64_LASX
 endfunc
+
+const shufb
+.byte 0,1,2,3, 1,2,3,4 ,2,3,4,5, 3,4,5,6
+.byte 4,5,6,7, 5,6,7,8 ,6,7,8,9, 7,8,9,10
+endconst
+
+.macro PUT_HEVC_EPEL_UNI_W_HV4_LSX w
+fld.d  f7, a2,  0  // start to load src
+fldx.d f8, a2,  a3
+alsl.d a2, a3,  a2,1
+fld.d  f9, a2,  0
+vshuf.bvr7,vr7, vr7,   vr0 // 0123 1234 2345 3456
+vshuf.bvr8,vr8, vr8,   vr0
+vshuf.bvr9,vr9, vr9,   vr0
+vdp2.h.bu.bvr10,   vr7, vr5  // EPEL_FILTER(src, 1)
+vdp2.h.bu.bvr11,   vr8, vr5
+vdp2.h.bu.bvr12,   vr9, vr5
+vhaddw.w.h vr10,   vr10,vr10 // tmp[0/1/2/3]
+vhaddw.w.h vr11,   vr11,vr11 // vr10,vr11,vr12 corresponding to 
EPEL_EXTRA
+vhaddw.w.h vr12,   vr12,vr12
+.LOOP_HV4_\w:
+add.d  a2, a2,  a3
+fld.d  f14,a2,  0// height loop begin
+vshuf.bvr14,   vr14,vr14,  vr0
+vdp2.h.bu.bvr13,   vr14,vr5
+vhaddw.w.h vr13,   vr13,vr13
+vmul.w vr14,   vr10,vr16 // EPEL_FILTER(tmp, MAX_PB_SIZE)
+vmadd.wvr14,   vr11,vr17
+vmadd.wvr14,   vr12,vr18
+vmadd.wvr14,   vr13,vr19
+vaddi.wu   vr10,   vr11,0//back up previous value
+vaddi.wu   vr11,   vr12,0
+vaddi.wu   vr12,   vr13,0
+vsrai.wvr14,   vr14,6// >> 6
+vmul.w vr14,   vr14,vr1  // * wx
+vadd.w vr14,   vr14,vr2  // + of

[FFmpeg-cvslog] avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt

2024-01-12 Thread yuanhecai
ffmpeg | branch: master | yuanhecai  | Thu Dec 28 
16:21:05 2023 +0800| [a87a52ed0b561dc231e707ee94299561631085ee] | committer: 
Michael Niedermayer

avcodec/hevc: Add ff_hevc_idct_32x32_lasx asm opt

tests/checkasm/checkasm:

  C  LSX   LASX
hevc_idct_32x32_8_c:  1243.0 211.7 101.7

Speedup of decoding H265 4K 30FPS 30Mbps on
3A6000 with 8 threads is 1fps(56fps-->57fps).

Reviewed-by: yinshiyou...@loongson.cn
Signed-off-by: Michael Niedermayer 

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=a87a52ed0b561dc231e707ee94299561631085ee
---

 libavcodec/loongarch/Makefile |   3 +-
 libavcodec/loongarch/hevc_idct.S  | 857 ++
 libavcodec/loongarch/hevc_idct_lsx.c  |  10 +-
 libavcodec/loongarch/hevcdsp_init_loongarch.c |   2 +
 libavcodec/loongarch/hevcdsp_lasx.h   |   2 +
 5 files changed, 868 insertions(+), 6 deletions(-)

diff --git a/libavcodec/loongarch/Makefile b/libavcodec/loongarch/Makefile
index ad98cd4054..07da2964e4 100644
--- a/libavcodec/loongarch/Makefile
+++ b/libavcodec/loongarch/Makefile
@@ -29,7 +29,8 @@ LSX-OBJS-$(CONFIG_HEVC_DECODER)   += 
loongarch/hevcdsp_lsx.o \
  loongarch/hevc_mc_uni_lsx.o \
  loongarch/hevc_mc_uniw_lsx.o \
  loongarch/hevc_add_res.o \
- loongarch/hevc_mc.o
+ loongarch/hevc_mc.o \
+ loongarch/hevc_idct.o
 LSX-OBJS-$(CONFIG_H264DSP)+= loongarch/h264idct.o \
  loongarch/h264idct_loongarch.o \
  loongarch/h264dsp.o
diff --git a/libavcodec/loongarch/hevc_idct.S b/libavcodec/loongarch/hevc_idct.S
new file mode 100644
index 00..83c46e17d7
--- /dev/null
+++ b/libavcodec/loongarch/hevc_idct.S
@@ -0,0 +1,857 @@
+/*
+ * Copyright (c) 2023 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan 
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "loongson_asm.S"
+
+.macro fr_store
+addi.dsp,   sp,   -64
+fst.d f24,  sp,   0
+fst.d f25,  sp,   8
+fst.d f26,  sp,   16
+fst.d f27,  sp,   24
+fst.d f28,  sp,   32
+fst.d f29,  sp,   40
+fst.d f30,  sp,   48
+fst.d f31,  sp,   56
+.endm
+
+.macro fr_recover
+fld.d f24,  sp,   0
+fld.d f25,  sp,   8
+fld.d f26,  sp,   16
+fld.d f27,  sp,   24
+fld.d f28,  sp,   32
+fld.d f29,  sp,   40
+fld.d f30,  sp,   48
+fld.d f31,  sp,   56
+addi.dsp,   sp,   64
+.endm
+
+.extern gt32x32_cnst1
+
+.extern gt32x32_cnst2
+
+.extern gt8x8_cnst
+
+.extern gt32x32_cnst0
+
+.macro idct_16x32_step1_lasx
+xvldrepl.wxr20, t1,   0
+xvldrepl.wxr21, t1,   4
+xvldrepl.wxr22, t1,   8
+xvldrepl.wxr23, t1,   12
+
+xvmulwev.w.h  xr16, xr8,  xr20
+xvmaddwod.w.h xr16, xr8,  xr20
+xvmulwev.w.h  xr17, xr9,  xr20
+xvmaddwod.w.h xr17, xr9,  xr20
+
+xvmaddwev.w.h xr16, xr10, xr21
+xvmaddwod.w.h xr16, xr10, xr21
+xvmaddwev.w.h xr17, xr11, xr21
+xvmaddwod.w.h xr17, xr11, xr21
+
+xvmaddwev.w.h xr16, xr12, xr22
+xvmaddwod.w.h xr16, xr12, xr22
+xvmaddwev.w.h xr17, xr13, xr22
+xvmaddwod.w.h xr17, xr13, xr22
+
+xvmaddwev.w.h xr16, xr14, xr23
+xvmaddwod.w.h xr16, xr14, xr23
+xvmaddwev.w.h xr17, xr15, xr23
+xvmaddwod.w.h xr17, xr15, xr23
+
+xvld  xr0,  t2,   0
+xvld  xr1,  t2,   32
+
+xvadd.w   xr18, xr0,  xr16
+xvadd.w   xr19, xr1,  xr17
+xvsub.w   xr0,  xr0,  xr16
+xvsub.w   xr1,