Re: [FFmpeg-devel] [PATCH v3 5/9] lavc/vp9dsp: R-V V mc avg

2024-05-17 Thread flow gg
Yes, I have updated it in the reply.

Rémi Denis-Courmont  于2024年5月17日周五 23:11写道:

> Le maanantaina 13. toukokuuta 2024, 19.59.22 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg4_8bpp_c: 1.2
> > vp9_avg4_8bpp_rvv_i64: 1.0
> > vp9_avg8_8bpp_c: 3.7
> > vp9_avg8_8bpp_rvv_i64: 1.5
> > vp9_avg16_8bpp_c: 14.7
> > vp9_avg16_8bpp_rvv_i64: 3.5
> > vp9_avg32_8bpp_c: 57.7
> > vp9_avg32_8bpp_rvv_i64: 10.0
> > vp9_avg64_8bpp_c: 229.0
> > vp9_avg64_8bpp_rvv_i64: 31.7
> > ---
> >  libavcodec/riscv/Makefile  |  3 +-
> >  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
> >  libavcodec/riscv/vp9dsp_init.c | 18 +++
> >  3 files changed, 78 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 0cd900104f..1183357b37 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -64,6 +64,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> >  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> >  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> >   riscv/vp9_mc_rvi.o
> > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> > +  riscv/vp9_mc_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > new file mode 100644
> > index 00..5d917e7b98
> > --- /dev/null
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -0,0 +1,58 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 len an maxlen mn=m4
> > +.if \len == 4
> > +vsetivlizero, \len, e8, mf4, ta, ma
> > +.elseif \len == 8
> > +vsetivlizero, \len, e8, mf2, ta, ma
> > +.elseif \len == 16
> > +vsetivlizero, \len, e8, m1, ta, ma
> > +.elseif \len == 32
> > +li  \an, \len
> > +vsetvli zero, \an, e8, m2, ta, ma
> > +.elseif \len == 64
> > +li  \an, \maxlen
> > +vsetvli zero, \an, e8, \mn, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro copy_avg len
> > +func ff_avg\len\()_rvv, zve32x
> > +csrwi   vxrm, 0
> > +vsetvlstatic8   \len t0 64
> > +1:
> > +addia4, a4, -1
> > +vle8.v  v8, (a2)
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v8, v8, v16
> > +vse8.v  v8, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +ret
>
> Doesn't this get slightly faster by interleaving scalar and vector
> instructions?
>
> > +endfunc
> > +.endm
> > +
> > +.irp len 64, 32, 16, 8, 4
> > +copy_avg \len
> > +.endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 184fadbaf7..1922484a1d 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) }
> >  # endif
> >
> > +#if HAVE_RVV
> > +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +
> > +#define init_fpel(idx1, sz)   \
> > +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = ff_avg##sz##_rvv
> > +
> > +init_fpel(0, 64);
> > +init_fpel(1, 32);
> > +init_fpel(2, 16);
> > +init_fpel(3, 8);
> > +init_fpel(4, 4);
> > +
> > +#undef init_fpel
> > +}
> > +#endif
> >  #endif
> >  }
>
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>

Re: [FFmpeg-devel] [PATCH v4 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-18 Thread flow gg
Fixed the missing comma in the .irp directives, as well as the .ifc issue
(the same modifications as were previously made for vp8).

 于2024年5月19日周日 02:16写道:

> From: sunyuechi 
>
> C908:
> vp9_avg4_8bpp_c: 1.2
> vp9_avg4_8bpp_rvv_i64: 1.0
> vp9_avg8_8bpp_c: 3.7
> vp9_avg8_8bpp_rvv_i64: 1.5
> vp9_avg16_8bpp_c: 14.7
> vp9_avg16_8bpp_rvv_i64: 3.5
> vp9_avg32_8bpp_c: 57.7
> vp9_avg32_8bpp_rvv_i64: 10.0
> vp9_avg64_8bpp_c: 229.0
> vp9_avg64_8bpp_rvv_i64: 31.7
> ---
>  libavcodec/riscv/Makefile  |  3 +-
>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>  3 files changed, 78 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..4739d83522 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>   riscv/vp9_mc_rvi.o
> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> +  riscv/vp9_mc_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> new file mode 100644
> index 00..7811cd9928
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -0,0 +1,58 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 len an maxlen mn=m4
> +.if \len == 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len == 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len == 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len == 32
> +li  \an, \len
> +vsetvli zero, \an, e8, m2, ta, ma
> +.elseif \len == 64
> +li  \an, \maxlen
> +vsetvli zero, \an, e8, \mn, ta, ma
> +.endif
> +.endm
> +
> +.macro copy_avg len
> +func ff_avg\len\()_rvv, zve32x
> +csrwi   vxrm, 0
> +vsetvlstatic8   \len t0 64
> +1:
> +vle8.v  v8, (a2)
> +vle8.v  v16, (a0)
> +vaaddu.vv   v8, v8, v16
> +addia4, a4, -1
> +vse8.v  v8, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +endfunc
> +.endm
> +
> +.irp len, 64, 32, 16, 8, 4
> +copy_avg \len
> +.endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index ab99294d44..6bfe23563a 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  }
>  # endif
>
> +#if HAVE_RVV
> +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128)) {
> +
> +#define init_fpel(idx1, sz)   \
> +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = ff_avg##sz##_rvv
> +
> +init_fpel(0, 64);
> +init_fpel(1, 32);
> +init_fpel(2, 16);
> +init_fpel(3, 8);
> +init_fpel(4, 4);
> +
> +#undef init_fpel
> +}
> +#endif
>  #endif
>  }
>
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/

Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp9dsp: R-V V mc bilin h v

2024-05-18 Thread flow gg
fixed in v4

Rémi Denis-Courmont  于2024年5月18日周六 23:56写道:

> Le maanantaina 13. toukokuuta 2024, 19.59.23 EEST u...@foxmail.com a
> écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg_bilin_4h_8bpp_c: 5.2
> > vp9_avg_bilin_4h_8bpp_rvv_i64: 2.2
> > vp9_avg_bilin_4v_8bpp_c: 5.5
> > vp9_avg_bilin_4v_8bpp_rvv_i64: 2.2
> > vp9_avg_bilin_8h_8bpp_c: 20.0
> > vp9_avg_bilin_8h_8bpp_rvv_i64: 4.5
> > vp9_avg_bilin_8v_8bpp_c: 21.0
> > vp9_avg_bilin_8v_8bpp_rvv_i64: 4.2
> > vp9_avg_bilin_16h_8bpp_c: 78.2
> > vp9_avg_bilin_16h_8bpp_rvv_i64: 9.0
> > vp9_avg_bilin_16v_8bpp_c: 82.0
> > vp9_avg_bilin_16v_8bpp_rvv_i64: 9.0
> > vp9_avg_bilin_32h_8bpp_c: 325.5
> > vp9_avg_bilin_32h_8bpp_rvv_i64: 26.2
> > vp9_avg_bilin_32v_8bpp_c: 326.2
> > vp9_avg_bilin_32v_8bpp_rvv_i64: 26.2
> > vp9_avg_bilin_64h_8bpp_c: 1265.7
> > vp9_avg_bilin_64h_8bpp_rvv_i64: 91.5
> > vp9_avg_bilin_64v_8bpp_c: 1317.0
> > vp9_avg_bilin_64v_8bpp_rvv_i64: 91.2
> > vp9_put_bilin_4h_8bpp_c: 4.5
> > vp9_put_bilin_4h_8bpp_rvv_i64: 1.7
> > vp9_put_bilin_4v_8bpp_c: 4.7
> > vp9_put_bilin_4v_8bpp_rvv_i64: 1.7
> > vp9_put_bilin_8h_8bpp_c: 17.0
> > vp9_put_bilin_8h_8bpp_rvv_i64: 3.5
> > vp9_put_bilin_8v_8bpp_c: 18.0
> > vp9_put_bilin_8v_8bpp_rvv_i64: 3.5
> > vp9_put_bilin_16h_8bpp_c: 65.2
> > vp9_put_bilin_16h_8bpp_rvv_i64: 7.5
> > vp9_put_bilin_16v_8bpp_c: 85.7
> > vp9_put_bilin_16v_8bpp_rvv_i64: 7.5
> > vp9_put_bilin_32h_8bpp_c: 257.5
> > vp9_put_bilin_32h_8bpp_rvv_i64: 23.5
> > vp9_put_bilin_32v_8bpp_c: 274.5
> > vp9_put_bilin_32v_8bpp_rvv_i64: 23.5
> > vp9_put_bilin_64h_8bpp_c: 1040.5
> > vp9_put_bilin_64h_8bpp_rvv_i64: 82.5
> > vp9_put_bilin_64v_8bpp_c: 1108.7
> > vp9_put_bilin_64v_8bpp_rvv_i64: 82.2
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 43 ++
> >  libavcodec/riscv/vp9dsp_init.c | 21 +
> >  2 files changed, 64 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 5d917e7b98..986cc3760d 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -53,6 +53,49 @@ func ff_avg\len\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_load dst len op type mn
> > +.ifc \type,v
> > +add t5, a2, a3
> > +.elseif \type == h
> > +addit5, a2, 1
> > +.endif
> > +vle8.v  v8, (a2)
> > +vle8.v  v0, (t5)
> > +vwmulu.vx   v16, v0, \mn
> > +vwmaccsu.vx v16, t1, v8
> > +vwadd.wxv16, v16, t4
> > +vnsra.wiv16, v16, 4
> > +vadd.vv \dst, v16, v8
> > +.ifc \op,avg
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   \dst, \dst, v16
> > +.endif
> > +.endm
> > +
> > +.macro bilin_h_v len op type mn
> > +func ff_\op\()_bilin_\len\()\type\()_rvv, zve32x
> > +.ifc \op,avg
> > +csrwi   vxrm, 0
> > +.endif
> > +vsetvlstatic8   \len t0 64
> > +li  t4, 8
> > +neg t1, \mn
> > +1:
> > +addia4, a4, -1
> > +bilin_load  v0, \len, \op, \type, \mn
> > +vse8.v  v0, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +
> > +ret
> > +endfunc
> > +.endm
> > +
> >  .irp len 64, 32, 16, 8, 4
>
> Missing comma after len
>
> >  copy_avg \len
> > +.irp op put avg
> > +bilin_h_v \len \op h a5
> > +bilin_h_v \len \op v a6
> > +.endr
> >  .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 1922484a1d..ec6db51774 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -63,6 +63,27 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) init_fpel(3, 8);
> >  init_fpel(4, 4);
> >
> > +dsp->mc[0][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_64v_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_64h_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_64v_rvv;
> > +dsp->mc[0][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_64h_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_32v_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_32h_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_32v_rvv;
> > +dsp->mc[1][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_32h_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_16v_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_16h_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_16v_rvv;
> > +dsp->mc[2][FILTER_BILINEAR ][1][1][0] = ff_avg_bilin_16h_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][0][0][1] = ff_put_bilin_8v_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][0][1][0] = ff_put_bilin_8h_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ][1][0][1] = ff_avg_bilin_8v_rvv;
> > +dsp->mc[3][FILTER_BILINEAR ]

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp8dsp: R-V V put_epel hv

2024-05-19 Thread flow gg
fix .irp use

 于2024年5月19日周日 16:18写道:

> From: sunyuechi 
>
> C908:
> vp8_put_epel4_h4v4_c: 20.0
> vp8_put_epel4_h4v4_rvv_i32: 11.0
> vp8_put_epel4_h4v6_c: 25.2
> vp8_put_epel4_h4v6_rvv_i32: 13.5
> vp8_put_epel4_h6v4_c: 22.2
> vp8_put_epel4_h6v4_rvv_i32: 14.5
> vp8_put_epel4_h6v6_c: 29.0
> vp8_put_epel4_h6v6_rvv_i32: 15.7
> vp8_put_epel8_h4v4_c: 73.0
> vp8_put_epel8_h4v4_rvv_i32: 22.2
> vp8_put_epel8_h4v6_c: 90.5
> vp8_put_epel8_h4v6_rvv_i32: 26.7
> vp8_put_epel8_h6v4_c: 85.0
> vp8_put_epel8_h6v4_rvv_i32: 27.2
> vp8_put_epel8_h6v6_c: 104.7
> vp8_put_epel8_h6v6_rvv_i32: 29.5
> vp8_put_epel16_h4v4_c: 145.5
> vp8_put_epel16_h4v4_rvv_i32: 26.5
> vp8_put_epel16_h4v6_c: 190.7
> vp8_put_epel16_h4v6_rvv_i32: 47.5
> vp8_put_epel16_h6v4_c: 173.7
> vp8_put_epel16_h6v4_rvv_i32: 33.2
> vp8_put_epel16_h6v6_c: 222.2
> vp8_put_epel16_h6v6_rvv_i32: 35.5
> ---
>  libavcodec/riscv/vp8dsp_init.c |  13 
>  libavcodec/riscv/vp8dsp_rvv.S  | 123 +++--
>  2 files changed, 115 insertions(+), 21 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index 31e8227fa4..86927907e0 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
>  c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
>  c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> +
> +c->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> +c->put_vp8_epel_pixels_tab[0][2][1] = ff_put_vp8_epel16_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> +c->put_vp8_epel_pixels_tab[0][1][1] = ff_put_vp8_epel16_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> +c->put_vp8_epel_pixels_tab[0][1][2] = ff_put_vp8_epel16_h6v4_rvv;
> +c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> +c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
>  }
>  #endif
>  #endif
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 0ba9fa443d..c79a8afacf 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -161,26 +161,26 @@ const subpel_filters
>  .byte 0,  -1,  12, 123,  -6, 0
>  endconst
>
> -.macro epel_filter size type
> -lla t2, subpel_filters
> +.macro epel_filter size type regtype
> +lla \regtype\()2, subpel_filters
>  .ifc \type,v
> -addit0, a6, -1
> +addi\regtype\()0, a6, -1
>  .else
> -addit0, a5, -1
> +addi\regtype\()0, a5, -1
>  .endif
> -li  t1, 6
> -mul t0, t0, t1
> -add t0, t0, t2
> +li  \regtype\()1, 6
> +mul \regtype\()0, \regtype\()0, \regtype\()1
> +add \regtype\()0, \regtype\()0, \regtype\()2
>  .irp n,1,2,3,4
> -lb  t\n, \n(t0)
> +lb  \regtype\n, \n(\regtype\()0)
>  .endr
>  .ifc \size,6
> -lb  t5, 5(t0)
> -lb  t0, (t0)
> +lb  \regtype\()5, 5(\regtype\()0)
> +lb  \regtype\()0, (\regtype\()0)
>  .endif
>  .endm
>
> -.macro epel_load dst len size type
> +.macro epel_load dst len size type from_mem regtype
>  .ifc \type,v
>  mv  a5, a3
>  .else
> @@ -189,24 +189,35 @@ endconst
>  sub t6, a2, a5
>  add a7, a2, a5
>
> +.if \from_mem
>  vle8.v  v24, (a2)
>  vle8.v  v22, (t6)
>  vle8.v  v26, (a7)
>  add a7, a7, a5
>  vle8.v  v28, (a7)
> -vwmulu.vx   v16, v24, t2
> -vwmulu.vx   v20, v26, t3
> +vwmulu.vx   v16, v24, \regtype\()2
> +vwmulu.vx   v20, v26, \regtype\()3
>  .ifc \size,6
>  sub t6, t6, a5
>  add a7, a7, a5
>  vle8.v  v24, (t6)
>  vle8.v  v26, (a7)
> -vwmaccu.vx  v16, t0, v24
> -vwmaccu.vx  v16, t5, v26
> +vwmaccu.vx  v16, \regtype\()0, v24
> +vwmaccu.vx  v16, \regtype\()5, v26
> +.endif
> +vwmaccsu.vx v16, \regtype\()1, v22
> +vwmaccsu.vx v16, \regtype\()4, v28
> +.else
> +vwmulu.vx   v16, v4, \regtype\()2
> +vwmulu.vx   v20, v6, \regtyp

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg
To reproduce these test results, you need to comment out the if (w == h)
check in tests/checkasm/vvc_mc.c.
Because vset has to be used inside the loop, I wrote a (rather cumbersome)
vset macro by hand.

 于2024年5月21日周二 15:38写道:

> From: sunyuechi 
>
>                            C908    X60
> avg_8_2x2_c            :    1.0    1.0
> avg_8_2x2_rvv_i32      :    0.7    0.7
> avg_8_2x4_c            :    2.0    2.0
> avg_8_2x4_rvv_i32      :    1.0    0.7
> avg_8_2x8_c            :    4.0    3.7
> avg_8_2x8_rvv_i32      :    1.5    1.2
> avg_8_2x16_c           :    7.5    7.7
> avg_8_2x16_rvv_i32     :    2.7    2.5
> avg_8_2x32_c           :   14.2   15.0
> avg_8_2x32_rvv_i32     :    5.0    4.5
> avg_8_2x64_c           :   28.5   30.2
> avg_8_2x64_rvv_i32     :    9.5    8.7
> avg_8_2x128_c          :   80.0   70.5
> avg_8_2x128_rvv_i32    :   50.7   41.2
> avg_8_4x2_c            :    1.7    2.0
> avg_8_4x2_rvv_i32      :    0.7    0.7
> avg_8_4x4_c            :    3.5    3.7
> avg_8_4x4_rvv_i32      :    1.2    1.0
> avg_8_4x8_c            :    6.7    7.0
> avg_8_4x8_rvv_i32      :    1.5    1.2
> avg_8_4x16_c           :   13.2   14.0
> avg_8_4x16_rvv_i32     :    2.7    2.5
> avg_8_4x32_c           :   26.2   27.7
> avg_8_4x32_rvv_i32     :    5.0    4.5
> avg_8_4x64_c           :   52.2   55.0
> avg_8_4x64_rvv_i32     :    9.5    8.7
> avg_8_4x128_c          :  146.0  117.5
> avg_8_4x128_rvv_i32    :   53.2   40.5
> avg_8_8x2_c            :    3.5    3.5
> avg_8_8x2_rvv_i32      :    0.7    0.7
> avg_8_8x4_c            :    6.5    6.5
> avg_8_8x4_rvv_i32      :    1.2    1.0
> avg_8_8x8_c            :   12.7   13.2
> avg_8_8x8_rvv_i32      :    2.0    1.5
> avg_8_8x16_c           :   25.2   26.2
> avg_8_8x16_rvv_i32     :    3.5    2.5
> avg_8_8x32_c           :   50.0   52.7
> avg_8_8x32_rvv_i32     :    6.5    4.7
> avg_8_8x64_c           :   99.7  105.0
> avg_8_8x64_rvv_i32     :   12.5    8.5
> avg_8_8x128_c          :  225.7  218.0
> avg_8_8x128_rvv_i32    :   78.0   39.2
> avg_8_16x2_c           :    6.2    6.7
> avg_8_16x2_rvv_i32     :    1.2    0.7
> avg_8_16x4_c           :   12.2   12.7
> avg_8_16x4_rvv_i32     :    2.0    1.2
> avg_8_16x8_c           :   24.7   26.0
> avg_8_16x8_rvv_i32     :    3.5    1.7
> avg_8_16x16_c          :   49.0   51.5
> avg_8_16x16_rvv_i32    :    6.2    3.2
> avg_8_16x32_c          :   97.5  102.5
> avg_8_16x32_rvv_i32    :   11.5    5.7
> avg_8_16x64_c          :  212.5  204.7
> avg_8_16x64_rvv_i32    :   22.5   11.0
> avg_8_16x128_c         :  411.2  418.2
> avg_8_16x128_rvv_i32   :   76.0   47.7
> avg_8_32x2_c           :   12.2   12.7
> avg_8_32x2_rvv_i32     :    2.0    1.2
> avg_8_32x4_c           :   24.2   25.5
> avg_8_32x4_rvv_i32     :    3.2    1.7
> avg_8_32x8_c           :   48.5   50.7
> avg_8_32x8_rvv_i32     :    5.7    3.2
> avg_8_32x16_c          :   96.5  101.2
> avg_8_32x16_rvv_i32    :   10.7    5.7
> avg_8_32x32_c          :  192.5  202.5
> avg_8_32x32_rvv_i32    :   20.7   10.5
> avg_8_32x64_c          :  411.2  404.5
> avg_8_32x64_rvv_i32    :   41.0   20.5
> avg_8_32x128_c

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg
There are three unused lines which I forgot to delete before submitting. I
have updated them here.

 于2024年5月21日周二 15:47写道:

> From: sunyuechi 
>
>                            C908    X60
> avg_8_2x2_c            :    1.0    1.0
> avg_8_2x2_rvv_i32      :    0.7    0.7
> avg_8_2x4_c            :    2.0    2.0
> avg_8_2x4_rvv_i32      :    1.0    0.7
> avg_8_2x8_c            :    4.0    3.7
> avg_8_2x8_rvv_i32      :    1.5    1.2
> avg_8_2x16_c           :    7.5    7.7
> avg_8_2x16_rvv_i32     :    2.7    2.5
> avg_8_2x32_c           :   14.2   15.0
> avg_8_2x32_rvv_i32     :    5.0    4.5
> avg_8_2x64_c           :   28.5   30.2
> avg_8_2x64_rvv_i32     :    9.5    8.7
> avg_8_2x128_c          :   80.0   70.5
> avg_8_2x128_rvv_i32    :   50.7   41.2
> avg_8_4x2_c            :    1.7    2.0
> avg_8_4x2_rvv_i32      :    0.7    0.7
> avg_8_4x4_c            :    3.5    3.7
> avg_8_4x4_rvv_i32      :    1.2    1.0
> avg_8_4x8_c            :    6.7    7.0
> avg_8_4x8_rvv_i32      :    1.5    1.2
> avg_8_4x16_c           :   13.2   14.0
> avg_8_4x16_rvv_i32     :    2.7    2.5
> avg_8_4x32_c           :   26.2   27.7
> avg_8_4x32_rvv_i32     :    5.0    4.5
> avg_8_4x64_c           :   52.2   55.0
> avg_8_4x64_rvv_i32     :    9.5    8.7
> avg_8_4x128_c          :  146.0  117.5
> avg_8_4x128_rvv_i32    :   53.2   40.5
> avg_8_8x2_c            :    3.5    3.5
> avg_8_8x2_rvv_i32      :    0.7    0.7
> avg_8_8x4_c            :    6.5    6.5
> avg_8_8x4_rvv_i32      :    1.2    1.0
> avg_8_8x8_c            :   12.7   13.2
> avg_8_8x8_rvv_i32      :    2.0    1.5
> avg_8_8x16_c           :   25.2   26.2
> avg_8_8x16_rvv_i32     :    3.5    2.5
> avg_8_8x32_c           :   50.0   52.7
> avg_8_8x32_rvv_i32     :    6.5    4.7
> avg_8_8x64_c           :   99.7  105.0
> avg_8_8x64_rvv_i32     :   12.5    8.5
> avg_8_8x128_c          :  225.7  218.0
> avg_8_8x128_rvv_i32    :   78.0   39.2
> avg_8_16x2_c           :    6.2    6.7
> avg_8_16x2_rvv_i32     :    1.2    0.7
> avg_8_16x4_c           :   12.2   12.7
> avg_8_16x4_rvv_i32     :    2.0    1.2
> avg_8_16x8_c           :   24.7   26.0
> avg_8_16x8_rvv_i32     :    3.5    1.7
> avg_8_16x16_c          :   49.0   51.5
> avg_8_16x16_rvv_i32    :    6.2    3.2
> avg_8_16x32_c          :   97.5  102.5
> avg_8_16x32_rvv_i32    :   11.5    5.7
> avg_8_16x64_c          :  212.5  204.7
> avg_8_16x64_rvv_i32    :   22.5   11.0
> avg_8_16x128_c         :  411.2  418.2
> avg_8_16x128_rvv_i32   :   76.0   47.7
> avg_8_32x2_c           :   12.2   12.7
> avg_8_32x2_rvv_i32     :    2.0    1.2
> avg_8_32x4_c           :   24.2   25.5
> avg_8_32x4_rvv_i32     :    3.2    1.7
> avg_8_32x8_c           :   48.5   50.7
> avg_8_32x8_rvv_i32     :    5.7    3.2
> avg_8_32x16_c          :   96.5  101.2
> avg_8_32x16_rvv_i32    :   10.7    5.7
> avg_8_32x32_c          :  192.5  202.5
> avg_8_32x32_rvv_i32    :   20.7   10.5
> avg_8_32x64_c          :  411.2  404.5
> avg_8_32x64_rvv_i32    :   41.0   20.5
> avg_8_32x128_c         :  834.7  855.2
> avg_8_32x128_rvv_i32   

Re: [FFmpeg-devel] [PATCH v4 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-21 Thread flow gg
> Please put commas between operands.

Okay

> This should probably be ff_avg_vp9 or something slightly more specific.

Is it necessary here? Many macros in the C file are copied from MIPS, where
it is called ff_avg4_msa. Here, it has been simply changed to ff_avg4_rvv.

Rémi Denis-Courmont  于2024年5月21日周二 23:24写道:

> Le lauantaina 18. toukokuuta 2024, 21.15.29 EEST u...@foxmail.com a écrit
> :
> > From: sunyuechi 
> >
> > C908:
> > vp9_avg4_8bpp_c: 1.2
> > vp9_avg4_8bpp_rvv_i64: 1.0
> > vp9_avg8_8bpp_c: 3.7
> > vp9_avg8_8bpp_rvv_i64: 1.5
> > vp9_avg16_8bpp_c: 14.7
> > vp9_avg16_8bpp_rvv_i64: 3.5
> > vp9_avg32_8bpp_c: 57.7
> > vp9_avg32_8bpp_rvv_i64: 10.0
> > vp9_avg64_8bpp_c: 229.0
> > vp9_avg64_8bpp_rvv_i64: 31.7
> > ---
> >  libavcodec/riscv/Makefile  |  3 +-
> >  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
> >  libavcodec/riscv/vp9dsp_init.c | 18 +++
> >  3 files changed, 78 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..4739d83522 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -65,6 +65,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
> >  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> >  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
> >   riscv/vp9_mc_rvi.o
> > -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> > +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> > +  riscv/vp9_mc_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > new file mode 100644
> > index 00..7811cd9928
> > --- /dev/null
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -0,0 +1,58 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 len an maxlen mn=m4
>
> Please put commas between operands.
>
> > +.if \len == 4
> > +vsetivlizero, \len, e8, mf4, ta, ma
> > +.elseif \len == 8
> > +vsetivlizero, \len, e8, mf2, ta, ma
> > +.elseif \len == 16
> > +vsetivlizero, \len, e8, m1, ta, ma
> > +.elseif \len == 32
> > +li  \an, \len
> > +vsetvli zero, \an, e8, m2, ta, ma
> > +.elseif \len == 64
> > +li  \an, \maxlen
> > +vsetvli zero, \an, e8, \mn, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro copy_avg len
> > +func ff_avg\len\()_rvv, zve32x
>
> This should probably be ff_avg_vp9 or something slightly more specific.
>
> > +csrwi   vxrm, 0
> > +vsetvlstatic8   \len t0 64
> > +1:
> > +vle8.v  v8, (a2)
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v8, v8, v16
> > +addia4, a4, -1
> > +vse8.v  v8, (a0)
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +ret
> > +endfunc
> > +.endm
> > +
> > +.irp len, 64, 32, 16, 8, 4
> > +copy_avg \len
> > +.endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index ab99294d44..6bfe23563a 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) }
> >  # endif
> >
> > +#if HAVE_RVV
> > +if (bpp == 8 && (flags & AV_CPU_FLAG_RVV_I32) &&
> ff_rv_vlen_least(128))
> > { +
> > +#define init_fpel(idx1, sz)   \
> > +dsp->mc[idx1][FILTER_8TAP_SMOOTH ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_REGULAR][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_8TAP_SHARP  ][1][0][0] = ff_avg##sz##_rvv;  \
> > +dsp->mc[idx1][FILTER_BILINEAR][1][0][0] = f

Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-21 Thread flow gg
> Please put commas between operands.
> This should probably be ff_avg_vp9 or something slightly more specific.

Updated here.

 于2024年5月22日周三 01:14写道:

> From: sunyuechi 
>
> C908:
> vp9_avg4_8bpp_c: 1.2
> vp9_avg4_8bpp_rvv_i64: 1.0
> vp9_avg8_8bpp_c: 3.7
> vp9_avg8_8bpp_rvv_i64: 1.5
> vp9_avg16_8bpp_c: 14.7
> vp9_avg16_8bpp_rvv_i64: 3.5
> vp9_avg32_8bpp_c: 57.7
> vp9_avg32_8bpp_rvv_i64: 10.0
> vp9_avg64_8bpp_c: 229.0
> vp9_avg64_8bpp_rvv_i64: 31.7
> ---
>  libavcodec/riscv/Makefile  |  3 +-
>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>  libavcodec/riscv/vp9dsp.h  |  4 +--
>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>  4 files changed, 80 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 07d5c2915d..67e198d754 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>   riscv/vp9_mc_rvi.o
> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
> +  riscv/vp9_mc_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> new file mode 100644
> index 00..7cb38ec94a
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -0,0 +1,58 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 len an maxlen mn=m4
> +.if \len == 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len == 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len == 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len == 32
> +li  \an, \len
> +vsetvli zero, \an, e8, m2, ta, ma
> +.elseif \len == 64
> +li  \an, \maxlen
> +vsetvli zero, \an, e8, \mn, ta, ma
> +.endif
> +.endm
> +
> +.macro copy_avg len
> +func ff_vp9_avg\len\()_rvv, zve32x
> +csrwi   vxrm, 0
> +vsetvlstatic8   \len, t0, 64
> +1:
> +vle8.v  v8, (a2)
> +vle8.v  v16, (a0)
> +vaaddu.vv   v8, v8, v16
> +addia4, a4, -1
> +vse8.v  v8, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +endfunc
> +.endm
> +
> +.irp len, 64, 32, 16, 8, 4
> +copy_avg \len
> +.endr
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 79330b4968..ff8431591c 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -138,11 +138,11 @@ void ff_avg_bilin_##SIZE##hv_rvv(uint8_t *dst,
> ptrdiff_t dststride,\
>   int h, int mx, int my);
>
>  #define VP9_COPY_AVG_RISCV_RVV_FUNC(SIZE)   \
> -void ff_copy##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,\
> +void ff_vp9_copy##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride,\
>   const uint8_t *src, ptrdiff_t srcstride,  \
>   int h, int mx, int my);   \
> \
> -void ff_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
> +void ff_vp9_avg##SIZE##_rvv(uint8_t *dst, ptrdiff_t dststride, \
>  const uint8_t *src, ptrdiff_t srcstride,   \
>  int h, int mx, int my);
>
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index ab99294d44..454dcd963f 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -48,6 +48,24 @@ static av_cold void vp9dsp_mc_init_risc

Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg
> I would expect that you can get better performance by interleaving scalar
> and vector stuff, and possibly also vector loads and vector arithmetic.

Okay, I will try

> These labels lead to nowhere? If you actually mean to implicitly fall
> through to the next function, you can use the function name directly rather
> than add odd labels.

These labels are used to convert variable parameters to constants to
achieve better performance and prepare for the next .irp. Some names are
strange because they cannot be duplicated. Here, there is only one
function, which should be executed after going through these labels?

Rémi Denis-Courmont  于2024年5月22日周三 00:04写道:

> Le tiistaina 21. toukokuuta 2024, 10.37.51 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> > ---
> >  libavcodec/riscv/Makefile  |   2 +
> >  libavcodec/riscv/vvc_mc_rvv.S  | 312 +
> >  libavcodec/riscv/vvcdsp_init.c |  76 
> >  libavcodec/vvc/dsp.c   |   4 +-
> >  libavcodec/vvc/dsp.h   |   1 +
> >  5 files changed, 394 insertions(+), 1 deletion(-)
> >  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
> >  create mode 100644 libavcodec/riscv/vvcdsp_init.c
> >
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 27b268ae39..6297664fc9 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
> \
> >  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> > +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> > diff --git a/libavcodec/riscv/vvc_mc_rvv.S
> b/libavcodec/riscv/vvc_mc_rvv.S
> > new file mode 100644
> > index 00..26a6afba1f
> > --- /dev/null
> > +++ b/libavcodec/riscv/vvc_mc_rvv.S
> > @@ -0,0 +1,312 @@
> > +/*
> > + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> > (ISCAS). + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +.macro vsetvlstatic8 w vlen is_w
> > +.if \w <= 2
> > +vsetivlizero, \w, e8, mf8, ta, ma
> > +.elseif \w <= 4 && \vlen == 128
> > +vsetivlizero, \w, e8, mf4, ta, ma
> > +.elseif \w <= 4 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf8, ta, ma
> > +.elseif \w <= 8 && \vlen == 128
> > +vsetivlizero, \w, e8, mf2, ta, ma
> > +.elseif \w <= 8 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf4, ta, ma
> > +.elseif \w <= 16 && \vlen == 128
> > +vsetivlizero, \w, e8, m1, ta, ma
> > +.elseif \w <= 16 && \vlen >= 256
> > +vsetivlizero, \w, e8, mf2, ta, ma
> > +.elseif \w <= 32 && \vlen >= 256
> > +li t0, \w
> > +vsetvli zero, t0, e8, m1, ta, ma
> > +.elseif \w <= (\vlen / 4) || \is_w
> > +li t0, 64
> > +vsetvli zero, t0, e8, m2, ta, ma
> > +.else
> > +li t0, \w
> > +vsetvli zero, t0, e8, m4, ta, ma
> > +.endif
> > +.endm
> > +
> > +.macro vsetvlstatic16 w vlen is_w
> > +.if \w <= 2
> > +vsetivlizero, \w, e16, mf4, ta, ma
> > +.elseif \w <= 4 && \vlen == 128
> > +vsetivlizero, \w, e16, mf2, ta, ma
> > +.elseif \w <= 4 && \vlen >= 256
> > +vsetivlizero, \w, e16, mf4, ta, ma
> > +.elseif \w <= 8 && \vlen == 128
> > +vsetivlizero, \w, e16, m1, ta, ma
> > +.elseif \w <= 8 && \vlen >= 256
> > +vsetivlizero, \w, e16, mf2, ta, ma
> > +.elseif \w <= 16 && \vlen == 128
> > +vsetivlizero, \w, e16, m2, ta, ma
> > +.elseif \w <= 16 && \vlen >= 256
> > +vsetivlizero, \w, e16, m1, ta, ma
> > +.elseif \w <= 32 && \vlen >= 25

Re: [FFmpeg-devel] [PATCH v2 2/5] lavc/vp9dsp: R-V V mc bilin h v

2024-05-21 Thread flow gg
Do macro definitions also need commas? I noticed that much of my old code
and SiFive's code doesn't use commas there.

Rémi Denis-Courmont  于2024年5月22日周三 02:29写道:

> Le tiistaina 21. toukokuuta 2024, 20.13.16 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
>
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 7cb38ec94a..739380d9a9 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -53,6 +53,49 @@ func ff_vp9_avg\len\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_load dst len op type mn
>
> Commas, please.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vvc_mc: R-V V avg w_avg

2024-05-21 Thread flow gg
Reordered some here.

 于2024年5月22日周三 03:24写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.01.0
> avg_8_2x2_rvv_i32  :0.70.7
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.00.7
> avg_8_2x8_c:4.03.7
> avg_8_2x8_rvv_i32  :1.51.2
> avg_8_2x16_c   :7.57.7
> avg_8_2x16_rvv_i32 :2.72.5
> avg_8_2x32_c   :   14.2   15.0
> avg_8_2x32_rvv_i32 :5.04.5
> avg_8_2x64_c   :   28.5   30.2
> avg_8_2x64_rvv_i32 :9.58.7
> avg_8_2x128_c  :   80.0   70.5
> avg_8_2x128_rvv_i32:   50.7   41.2
> avg_8_4x2_c:1.72.0
> avg_8_4x2_rvv_i32  :0.70.7
> avg_8_4x4_c:3.53.7
> avg_8_4x4_rvv_i32  :1.21.0
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :1.51.2
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :2.72.5
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.04.5
> avg_8_4x64_c   :   52.2   55.0
> avg_8_4x64_rvv_i32 :9.58.7
> avg_8_4x128_c  :  146.0  117.5
> avg_8_4x128_rvv_i32:   53.2   40.5
> avg_8_8x2_c:3.53.5
> avg_8_8x2_rvv_i32  :0.70.7
> avg_8_8x4_c:6.56.5
> avg_8_8x4_rvv_i32  :1.21.0
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.01.5
> avg_8_8x16_c   :   25.2   26.2
> avg_8_8x16_rvv_i32 :3.52.5
> avg_8_8x32_c   :   50.0   52.7
> avg_8_8x32_rvv_i32 :6.54.7
> avg_8_8x64_c   :   99.7  105.0
> avg_8_8x64_rvv_i32 :   12.58.5
> avg_8_8x128_c  :  225.7  218.0
> avg_8_8x128_rvv_i32:   78.0   39.2
> avg_8_16x2_c   :6.26.7
> avg_8_16x2_rvv_i32 :1.20.7
> avg_8_16x4_c   :   12.2   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.51.7
> avg_8_16x16_c  :   49.0   51.5
> avg_8_16x16_rvv_i32:6.23.2
> avg_8_16x32_c  :   97.5  102.5
> avg_8_16x32_rvv_i32:   11.55.7
> avg_8_16x64_c  :  212.5  204.7
> avg_8_16x64_rvv_i32:   22.5   11.0
> avg_8_16x128_c :  411.2  418.2
> avg_8_16x128_rvv_i32   :   76.0   47.7
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :2.01.2
> avg_8_32x4_c   :   24.2   25.5
> avg_8_32x4_rvv_i32 :3.21.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.73.2
> avg_8_32x16_c  :   96.5  101.2
> avg_8_32x16_rvv_i32:   10.75.7
> avg_8_32x32_c  :  192.5  202.5
> avg_8_32x32_rvv_i32:   20.7   10.5
> avg_8_32x64_c  :  411.2  404.5
> avg_8_32x64_rvv_i32:   41.0   20.5
> avg_8_32x128_c :  834.7  855.2
> avg_8_32x128_rvv_i32   :  151.2  118.7
> avg_8_64x2_c

Re: [FFmpeg-devel] [PATCH] lavc/rv34dsp: optimise R-V V idct_dc_add

2024-05-22 Thread flow gg
Unfortunately, I only run the tests to obtain benchmarks and check basic
correctness. I always feel that a professional is needed to write the tests.

Rémi Denis-Courmont  于2024年5月23日周四 04:35写道:

>
>
> Le 22 mai 2024 23:28:54 GMT+03:00, "Rémi Denis-Courmont" 
> a écrit :
> >This removes one stray LI and reworks the vector arithmetic to avoid
> >changing the vector configuration. On K230, this takes the cycle
> >count down from 46.5 to 43.5.
> >---
> > libavcodec/riscv/rv34dsp_rvv.S | 13 ++---
> > 1 file changed, 6 insertions(+), 7 deletions(-)
> >
> >diff --git a/libavcodec/riscv/rv34dsp_rvv.S
> b/libavcodec/riscv/rv34dsp_rvv.S
> >index f1f6345012..e8aff7e570 100644
> >--- a/libavcodec/riscv/rv34dsp_rvv.S
> >+++ b/libavcodec/riscv/rv34dsp_rvv.S
> >@@ -36,16 +36,15 @@ func ff_rv34_idct_dc_add_rvv, zve32x
> > vsetivli  zero, 4, e8, mf4, ta, ma
> > vlse32.v  v0, (a0), a1
> > lit1, 169
> >+lit2, 128
> > mul   t1, t1, a2
> >-lia2, 255
> >+vsetivli  zero, 4*4, e8, m1, ta, ma
> >+vwsubu.vx v2, v0, t2
> > addi  t1, t1, 512
> > srai  t1, t1, 10
> >-vsetivli  zero, 4*4, e16, m2, ta, ma
> >-vzext.vf2 v2, v0
> >-vadd.vx   v2, v2, t1
> >-vmax.vx   v2, v2, zero
> >-vsetvli   zero, zero, e8, m1, ta, ma
> >-vnclipu.wiv0, v2, 0
> >+vwadd.wx  v2, v2, t1
>
> Hmm, this should not work, as t1 has more than 8 bits. Maybe checkasm is
> sloppy here.
>
> >+vnclip.wi v0, v2, 0
> >+vxor.vx   v0, v0, t2
> > vsetivli  zero, 4, e8, mf4, ta, ma
> > vsse32.v  v0, (a0), a1
> >
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-23 Thread flow gg
I want to update the VP9 bilin load, just like you did with VP8, but it
seems that what was merged for this patch ([PATCH v2 1/5] lavc/vp9dsp: R-V V
mc avg) is the previous version rather than the current update posted here,
so the subsequent patches will have conflicts.

flow gg  于2024年5月22日周三 01:15写道:

> > Please put commas between operands.
> > This should probably be ff_avg_vp9 or something slightly more specific.
>
> Updated here.
>
>  于2024年5月22日周三 01:14写道:
>
>> From: sunyuechi 
>>
>> C908:
>> vp9_avg4_8bpp_c: 1.2
>> vp9_avg4_8bpp_rvv_i64: 1.0
>> vp9_avg8_8bpp_c: 3.7
>> vp9_avg8_8bpp_rvv_i64: 1.5
>> vp9_avg16_8bpp_c: 14.7
>> vp9_avg16_8bpp_rvv_i64: 3.5
>> vp9_avg32_8bpp_c: 57.7
>> vp9_avg32_8bpp_rvv_i64: 10.0
>> vp9_avg64_8bpp_c: 229.0
>> vp9_avg64_8bpp_rvv_i64: 31.7
>> ---
>>  libavcodec/riscv/Makefile  |  3 +-
>>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>>  libavcodec/riscv/vp9dsp.h  |  4 +--
>>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>>  4 files changed, 80 insertions(+), 3 deletions(-)
>>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>>
>> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
>> index 07d5c2915d..67e198d754 100644
>> --- a/libavcodec/riscv/Makefile
>> +++ b/libavcodec/riscv/Makefile
>> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>>   riscv/vp9_mc_rvi.o
>> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
>> +  riscv/vp9_mc_rvv.o
>>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
>> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
>> new file mode 100644
>> index 00..7cb38ec94a
>> --- /dev/null
>> +++ b/libavcodec/riscv/vp9_mc_rvv.S
>> @@ -0,0 +1,58 @@
>> +/*
>> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
>> (ISCAS).
>> + *
>> + * This file is part of FFmpeg.
>> + *
>> + * FFmpeg is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * FFmpeg is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with FFmpeg; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>> 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/riscv/asm.S"
>> +
>> +.macro vsetvlstatic8 len an maxlen mn=m4
>> +.if \len == 4
>> +vsetivlizero, \len, e8, mf4, ta, ma
>> +.elseif \len == 8
>> +vsetivlizero, \len, e8, mf2, ta, ma
>> +.elseif \len == 16
>> +vsetivlizero, \len, e8, m1, ta, ma
>> +.elseif \len == 32
>> +li  \an, \len
>> +vsetvli zero, \an, e8, m2, ta, ma
>> +.elseif \len == 64
>> +li  \an, \maxlen
>> +vsetvli zero, \an, e8, \mn, ta, ma
>> +.endif
>> +.endm
>> +
>> +.macro copy_avg len
>> +func ff_vp9_avg\len\()_rvv, zve32x
>> +csrwi   vxrm, 0
>> +vsetvlstatic8   \len, t0, 64
>> +1:
>> +vle8.v  v8, (a2)
>> +vle8.v  v16, (a0)
>> +vaaddu.vv   v8, v8, v16
>> +addia4, a4, -1
>> +vse8.v  v8, (a0)
>> +add a2, a2, a3
>> +add a0, a0, a1
>> +bneza4, 1b
>> +ret
>> +endfunc
>> +.endm
>> +
>> +.irp len, 64, 32, 16, 8, 4
>> +copy_avg \len
>> +.endr
>> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
>> index 79330b4968..ff8431591c 100644
>> --- a/libavcodec/riscv/vp9dsp.h
>> +++ b/libavcodec/riscv/vp9dsp.h
>> @@ -138,11 +138,11 @@ void ff_a

Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-25 Thread flow gg
> Is there a reason that you cannot use the tables from C code?

Similar to VP8: to adjust the signs of the data and prevent a
low-probability overflow during calculations.

> AFAICT, regular and sharp are identical, except for the base address of
the
> filter table, so it should be possible to share the byte code

Initially, they used the same code, but after testing hundreds of times,
there were always a few failures...

Because the data in the table is different, when regular, sharp, and smooth
use the same code, there will always be a small amount of overflow.
Different signed and unsigned calculations are needed.

> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Got it, I will try to update.

Rémi Denis-Courmont  于2024年5月25日周六 18:17写道:

> Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >  C908   X60
> > vp9_avg_8tap_smooth_4h_8bpp_c  :   13.0   11.2
> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:5.04.2
> > vp9_avg_8tap_smooth_4v_8bpp_c  :   13.7   12.5
> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:5.04.2
> > vp9_avg_8tap_smooth_8h_8bpp_c  :   49.5   42.2
> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.28.5
> > vp9_avg_8tap_smooth_8v_8bpp_c  :   66.5   45.0
> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.7  166.5
> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
> > vp9_avg_8tap_smooth_16v_8bpp_c :  192.2  175.7
> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.7
> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.5  689.2
> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
> > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
> > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.29.7
> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.24.0
> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.7
> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.24.0
> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_8v_8bpp_c  :   44.2   38.7
> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.57.7
> > vp9_put_8tap_smooth_16h_8bpp_c :  165.7  147.2
> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
> > vp9_put_8tap_smooth_16v_8bpp_c :  169.0  149.7
> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> > vp9_put_8tap_smooth_32h_8bpp_c :  659.7  586.7
> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
> > vp9_put_8tap_smooth_32v_8bpp_c :  680.5  591.2
> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
> > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
> > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 243 +
> >  libavcodec/riscv/vp9dsp.h  |  72 ++
> >  libavcodec/riscv/vp9dsp_init.c |  38 +-
> >  3 files changed, 328 insertions(+), 25 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index 739380d9a9..adba4afb90 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -36,6 +36,18 @@
> >  .endif
> >  .endm
> >
> > +.macro vsetvlstatic16 len
> > +.ifc \len,4
> > +vsetvli zero, zero, e16, mf2, ta, ma
> > +.elseif \len == 8
> > +vsetvli zero, zero, e16, m1, ta, ma
> > +.elseif \len == 16
> > +vsetvli zero, zero, e16, m2, ta, ma
> > +.else
> > +vsetvli zero, zero, e16, m4, ta, ma
> > +.endif
> > +.endm
> > +
> >  .macro copy_avg len
> >  func ff_vp9_avg\len\()_rvv, zve32x
> >  csrwi   vxrm, 0
> > @@ -92,10 +104,241 @@ func ff_\op\()_v

Re: [FFmpeg-devel] [PATCH v2 3/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-25 Thread flow gg
One more thing I remember is that after adjusting the sign, vmacc can be
used; otherwise, due to the sign, mul + add are needed.

flow gg  于2024年5月25日周六 18:38写道:

> > Is there a reason that you cannot use the tables from C code?
>
> Similar to VP8, to adjust the positive and negative data and prevent small
> probability overflow during calculations.
>
> > AFAICT, regular and sharp are identical, except for the base address of
> the
> > filter table, so it should be possible to share the byte code
>
> Initially, they used the same code, but after testing hundreds of times,
> there were always a few failures...
>
> Because the data in the table is different, when regular, sharp, and
> smooth use the same code, there will always be a small amount of overflow.
> Different signed and unsigned calculations are needed.
>
> > A French philosopher famously said that Perfect is the enemy of Good.
> > Generally, as with VVC, nested repetition macros for finely specialised
> > functions tend to generate way too much byte code, and this ends up being
> > worse rather than better in the big picture.
>
> Got it, I will try to update.
>
> Rémi Denis-Courmont  于2024年5月25日周六 18:17写道:
>
>> Le tiistaina 21. toukokuuta 2024, 20.13.17 EEST u...@foxmail.com a écrit
>> :
>> > From: sunyuechi 
>> >
>> >  C908   X60
>> > vp9_avg_8tap_smooth_4h_8bpp_c  :   13.0   11.2
>> > vp9_avg_8tap_smooth_4h_8bpp_rvv_i32:5.04.2
>> > vp9_avg_8tap_smooth_4v_8bpp_c  :   13.7   12.5
>> > vp9_avg_8tap_smooth_4v_8bpp_rvv_i32:5.04.2
>> > vp9_avg_8tap_smooth_8h_8bpp_c  :   49.5   42.2
>> > vp9_avg_8tap_smooth_8h_8bpp_rvv_i32:9.28.5
>> > vp9_avg_8tap_smooth_8v_8bpp_c  :   66.5   45.0
>> > vp9_avg_8tap_smooth_8v_8bpp_rvv_i32:9.58.5
>> > vp9_avg_8tap_smooth_16h_8bpp_c :  192.7  166.5
>> > vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
>> > vp9_avg_8tap_smooth_16v_8bpp_c :  192.2  175.7
>> > vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
>> > vp9_avg_8tap_smooth_32h_8bpp_c :  780.2  663.7
>> > vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
>> > vp9_avg_8tap_smooth_32v_8bpp_c :  770.5  689.2
>> > vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
>> > vp9_avg_8tap_smooth_64h_8bpp_c : 3115.5 2647.2
>> > vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
>> > vp9_avg_8tap_smooth_64v_8bpp_c : 3082.2 2729.0
>> > vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
>> > vp9_put_8tap_smooth_4h_8bpp_c  :   11.29.7
>> > vp9_put_8tap_smooth_4h_8bpp_rvv_i32:4.24.0
>> > vp9_put_8tap_smooth_4v_8bpp_c  :   11.7   10.7
>> > vp9_put_8tap_smooth_4v_8bpp_rvv_i32:4.24.0
>> > vp9_put_8tap_smooth_8h_8bpp_c  :   42.0   37.5
>> > vp9_put_8tap_smooth_8h_8bpp_rvv_i32:8.57.7
>> > vp9_put_8tap_smooth_8v_8bpp_c  :   44.2   38.7
>> > vp9_put_8tap_smooth_8v_8bpp_rvv_i32:8.57.7
>> > vp9_put_8tap_smooth_16h_8bpp_c :  165.7  147.2
>> > vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
>> > vp9_put_8tap_smooth_16v_8bpp_c :  169.0  149.7
>> > vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
>> > vp9_put_8tap_smooth_32h_8bpp_c :  659.7  586.7
>> > vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
>> > vp9_put_8tap_smooth_32v_8bpp_c :  680.5  591.2
>> > vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
>> > vp9_put_8tap_smooth_64h_8bpp_c : 2681.5 2339.0
>> > vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
>> > vp9_put_8tap_smooth_64v_8bpp_c : 2709.7 2348.7
>> > vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
>> > ---
>> >  libavcodec/riscv/vp9_mc_rvv.S  | 243 +
>> >  libavcodec/riscv/vp9dsp.h  |  72 ++
>> >  libavcodec/riscv/vp9dsp_init.c |  38 +-
>> >  3 files change

Re: [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: factor R-V V EPEL functions for all lengths

2024-05-25 Thread flow gg
Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16 with
two vsetvl? This would require the previous patch and this one to work
together, increasing the number of lines of code and making the code a bit
harder to read.
Additionally, I have a question about patch 4 'save one R-V GPR' and patch
5. Should they be submitted as a single patch? Because patch 4 looks
similar to what I initially submitted, and you suggested changing it to
save lines of code. If it is only for patch 5, shouldn't they be combined
together?

Rémi Denis-Courmont  于2024年5月25日周六 23:39写道:

> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 56 ---
>  1 file changed, 32 insertions(+), 24 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index a4fcd158a5..002e7f3174 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -32,16 +32,6 @@
>  .endif
>  .endm
>
> -.macro vsetvlstatic16 len
> -.if \len <= 4
> -vsetivlizero, \len, e16, mf2, ta, ma
> -.elseif \len <= 8
> -vsetivlizero, \len, e16, m1, ta, ma
> -.elseif \len <= 16
> -vsetivlizero, \len, e16, m2, ta, ma
> -.endif
> -.endm
> -
>  .macro vp8_idct_dc_add
>  vlse32.v  v0, (a0), a2
>  lha5, 0(a1)
> @@ -181,13 +171,8 @@ const subpel_filters
>  .byte 0,  -1,  12, 123,  -6, 0
>  endconst
>
> -.macro epel len size type
> -func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> -.ifc \type,v
> -addit0, a6, -1
> -.else
> -addit0, a5, -1
> -.endif
> +.macro epel_common size, type
> +func ff_put_vp8_epel_\type\()\size\().rvv, zve32x
>  lla t2, subpel_filters
>  sh1add  t0, t0, t0
>  sh1add  t0, t0, t2
> @@ -198,7 +183,6 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  lb  t5, 5(t0)
>  lb  t0, (t0)
>  .endif
> -vsetvlstatic8   \len
>  1:
>  addia4, a4, -1
>  .ifc \type,v
> @@ -236,11 +220,11 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  vwmaccsu.vx v16, t1, v22
>  vwmaccsu.vx v16, t4, v28
>  vwadd.wxv16, v16, t6
> -vsetvlstatic16  \len
> +vsetvl  zero, zero, a6 # e16
>  vwadd.vvv24, v16, v20
>  vnsra.wiv24, v24, 7
>  vmax.vx v24, v24, zero
> -vsetvlstatic8   \len
> +vsetvl  zero, zero, a5 # e8
>  vnclipu.wi  v30, v24, 0
>  add a2, a2, a3
>  vse8.v  v30, (a0)
> @@ -251,9 +235,33 @@ func ff_put_vp8_epel\len\()_\type\()\size\()_rvv,
> zve32x
>  endfunc
>  .endm
>
> +.macro epel len, size, type
> +func ff_put_vp8_epel\len\()_\type\()\size\()_rvv, zve32x
> +.ifc \type,v
> +addit0, a6, -1
> +.else
> +addit0, a5, -1
> +.endif
> +.if \len <= 4
> +li  a5, 0306 # e8, mf4, ta, ma
> +li  a6, 0317 # e16, mf2, ta, ma
> +.elseif \len <= 8
> +li  a5, 0307 # e8, mf2, ta, ma
> +li  a6, 0310 # e16, m1, ta, ma
> +.else # if len <= 16
> +li  a5, 0300 # e8, m1, ta, ma
> +li  a6, 0311 # e16, m2, ta, ma
> +.endif
> +vsetvlstatic8 \len
> +j   ff_put_vp8_epel_\type\()\size\().rvv
> +endfunc
> +.endm
> +
> +.irp type,h,v
> +.irp size,4,6
> +epel_common \size, \type
>  .irp len,16,8,4
> -epel \len 6 h
> -epel \len 4 h
> -epel \len 6 v
> -epel \len 4 v
> +epel \len, \size, \type
> +.endr
> +.endr
>  .endr
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 5/5] lavc/vp8dsp: factor R-V V EPEL functions for all lengths

2024-05-25 Thread flow gg
Well, I'm mainly considering that we have added some vset related lines,
but they haven't played a new role for the time being. If it's for future
modifications, it does make sense.

> This is reducing code size by over 2 kib of code, or several hundreds of
instructions.

The reduction in code size seems to come from switching to jumping (j) to
shared labels rather than from vset, which is a separate matter. Jumping to
labels is indeed better. I will make similar modifications.

Rémi Denis-Courmont  于2024年5月26日周日 02:29写道:

> Le lauantaina 25. toukokuuta 2024, 21.16.22 EEST flow gg a écrit :
> > Would it be better to replace the two vsetvlstatic8 and vsetvlstatic16
> with
> > two vsetvl?
>
> The other option is to hard-code the most pessimistic multiplier. That
> would
> be easier to read and save two instructions in the head, but it would most
> likely
> end up slower overall, due to increased latency from the vector unit in
> the
> main loop.
>
> On the other hand, with vsetvl, we have the option to adjust the
> multiplier at
> run-time depending on hardware vector size. That will not be possible with
> vsetvli unless we patch the code live (yikes).
>
> > This would require the previous patch and this one to work
> > together,
>
> Yes, patch order matters.
>
> > increasing the number of lines of code
>
> This is reducing code size by over 2 kib of code, or several hundreds of
> instructions.
>
> > Additionally, I have a question about patch 4 'save one R-V GPR' and
> patch
> > 5. Should they be submitted as a single patch? Because patch 4 looks
> > similar to what I initially submitted, and you suggested changing it to
> > save lines of code. If it is only for patch 5, shouldn't they be combined
> > together?
>
> I think people here like to have as small and many patches as possible, as
> is
> generally considered the right way to use Git. Since patch 4 is a very
> minor
> but still independent (from patch 5) improvement, it should be separate,
> as
> far as I understand FFmpeg's practices.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg

2024-05-26 Thread flow gg
Hi, maybe we can prioritize this revert:
https://git.ffmpeg.org/gitweb/ffmpeg.git/commit/0c1304ae11b0361ede055ee8ffc6e83529468c73
Using [PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg to avoid conflicts with
other patches.

flow gg  于2024年5月24日周五 14:13写道:

> I want to update the VP9 bilin load, just like you did with VP8, but it
> seems like this patch([PATCH v2 1/5] lavc/vp9dsp: R-V V mc avg) doesn't
> merge the current updates here but merges the previous version instead, so
> the subsequent patches will have conflicts.
>
> flow gg  于2024年5月22日周三 01:15写道:
>
>> > Please put commas between operands.
>> > This should probably be ff_avg_vp9 or something slightly more specific.
>>
>> Updated here.
>>
>>  于2024年5月22日周三 01:14写道:
>>
>>> From: sunyuechi 
>>>
>>> C908:
>>> vp9_avg4_8bpp_c: 1.2
>>> vp9_avg4_8bpp_rvv_i64: 1.0
>>> vp9_avg8_8bpp_c: 3.7
>>> vp9_avg8_8bpp_rvv_i64: 1.5
>>> vp9_avg16_8bpp_c: 14.7
>>> vp9_avg16_8bpp_rvv_i64: 3.5
>>> vp9_avg32_8bpp_c: 57.7
>>> vp9_avg32_8bpp_rvv_i64: 10.0
>>> vp9_avg64_8bpp_c: 229.0
>>> vp9_avg64_8bpp_rvv_i64: 31.7
>>> ---
>>>  libavcodec/riscv/Makefile  |  3 +-
>>>  libavcodec/riscv/vp9_mc_rvv.S  | 58 ++
>>>  libavcodec/riscv/vp9dsp.h  |  4 +--
>>>  libavcodec/riscv/vp9dsp_init.c | 18 +++
>>>  4 files changed, 80 insertions(+), 3 deletions(-)
>>>  create mode 100644 libavcodec/riscv/vp9_mc_rvv.S
>>>
>>> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
>>> index 07d5c2915d..67e198d754 100644
>>> --- a/libavcodec/riscv/Makefile
>>> +++ b/libavcodec/riscv/Makefile
>>> @@ -69,6 +69,7 @@ RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>>>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>>>  RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>>>   riscv/vp9_mc_rvi.o
>>> -RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>>> +RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o \
>>> +  riscv/vp9_mc_rvv.o
>>>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>>>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
>>> diff --git a/libavcodec/riscv/vp9_mc_rvv.S
>>> b/libavcodec/riscv/vp9_mc_rvv.S
>>> new file mode 100644
>>> index 00..7cb38ec94a
>>> --- /dev/null
>>> +++ b/libavcodec/riscv/vp9_mc_rvv.S
>>> @@ -0,0 +1,58 @@
>>> +/*
>>> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
>>> (ISCAS).
>>> + *
>>> + * This file is part of FFmpeg.
>>> + *
>>> + * FFmpeg is free software; you can redistribute it and/or
>>> + * modify it under the terms of the GNU Lesser General Public
>>> + * License as published by the Free Software Foundation; either
>>> + * version 2.1 of the License, or (at your option) any later version.
>>> + *
>>> + * FFmpeg is distributed in the hope that it will be useful,
>>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * Lesser General Public License for more details.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with FFmpeg; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>> + */
>>> +
>>> +#include "libavutil/riscv/asm.S"
>>> +
>>> +.macro vsetvlstatic8 len an maxlen mn=m4
>>> +.if \len == 4
>>> +vsetivlizero, \len, e8, mf4, ta, ma
>>> +.elseif \len == 8
>>> +vsetivlizero, \len, e8, mf2, ta, ma
>>> +.elseif \len == 16
>>> +vsetivlizero, \len, e8, m1, ta, ma
>>> +.elseif \len == 32
>>> +li  \an, \len
>>> +vsetvli zero, \an, e8, m2, ta, ma
>>> +.elseif \len == 64
>>> +li  \an, \maxlen
>>> +vsetvli zero, \an, e8, \mn, ta, ma
>>> +.endif
>>> +.endm
>>> +
>>> +.macro copy_avg len
>>> +func ff_vp9_avg\len\()_rvv, zve32x
>>> +csrwi   vxrm, 0
>>> +vsetvlstatic8   \len, t0, 64
>>> +1:
>>> +vle8.v  v8, (a2)
>>> +vle8.v  

Re: [FFmpeg-devel] [PATCH v3 4/5] lavc/vp9dsp: R-V V mc tap h v

2024-05-29 Thread flow gg
A portion has been modified according to the previous review, but there are
still some parts that haven't been updated

> Similarly, it
> should be possible to share most of the horizontal and vertical code
(maybe
> also for bilinear. not just EPel) with separate load/store then inner
> procedures. The H.263 loop filter already does that though with almost no
> overhead, though
> H.263 is obviously simpler than VP9.
>
> A French philosopher famously said that Perfect is the enemy of Good.
> Generally, as with VVC, nested repetition macros for finely specialised
> functions tend to generate way too much byte code, and this ends up being
> worse rather than better in the big picture.

Here, bilin is modified with reference to your vp8 modification method, but
there are some issues with epel. I want to share most of the horizontal and
vertical code like h263, but because there are different types
(op/name/len), such changes seem hard. Trying to make similar modifications
for bilin also seems somewhat hard, so maybe leave it for future optimization
:'(

> It should be possible to spare one ADDI by using just AUIPC here, and
folding
> the immediate offset into the LB's below (see also H.263 loop filter).

I'm not sure where the problem lies: it works for smooth, but for
sharp and regular it gives this error:
dangerous relocation: %pcrel_lo overflow with an addend, the value of
%pcrel_hi is 0xa5000 without any addend, but may be 0xa6000 after adding
the %pcrel_lo addend

 于2024年5月30日周四 01:16写道:

> From: sunyuechi 
>
>                                            C908    X60
> vp9_avg_8tap_smooth_4h_8bpp_c          :   13.0   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32    :    5.0    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c          :   13.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32    :    5.0    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c          :   49.5   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32    :    9.2    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c          :   66.5   45.0
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32    :    9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c         :  192.7  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32   :   21.2   18.7
> vp9_avg_8tap_smooth_16v_8bpp_c         :  192.2  175.7
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32   :   21.5   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c         :  780.2  663.7
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32   :   83.5   60.0
> vp9_avg_8tap_smooth_32v_8bpp_c         :  770.5  689.2
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32   :   67.2   60.0
> vp9_avg_8tap_smooth_64h_8bpp_c         : 3115.5 2647.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32   :  283.5  119.2
> vp9_avg_8tap_smooth_64v_8bpp_c         : 3082.2 2729.0
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32   :  305.2  119.0
> vp9_put_8tap_smooth_4h_8bpp_c          :   11.2    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32    :    4.2    4.0
> vp9_put_8tap_smooth_4v_8bpp_c          :   11.7   10.7
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32    :    4.2    4.0
> vp9_put_8tap_smooth_8h_8bpp_c          :   42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32    :    8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c          :   44.2   38.7
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32    :    8.5    7.7
> vp9_put_8tap_smooth_16h_8bpp_c         :  165.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32   :   19.5   17.5
> vp9_put_8tap_smooth_16v_8bpp_c         :  169.0  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32   :   19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c         :  659.7  586.7
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32   :   64.2   57.2
> vp9_put_8tap_smooth_32v_8bpp_c         :  680.5  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32   :   64.2   57.2
> vp9_put_8tap_smooth_64h_8bpp_c         : 2681.5 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32   :  255.5  114.2
> vp9_put_8tap_smooth_64v_8bpp_c         : 2709.7 2348.7
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32   :  255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 204 +
>  libavcodec/riscv/vp9dsp.h  |  72 
>  libavcodec/riscv/vp9dsp_init.c |  37 +-
>  3 files changed, 288 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 990271736b..53dd833dac 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1,

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-05-30 Thread flow gg
I directly copied the VP9 modifications over... Since len <= 16, it seems
like it can be improved a bit more

 于2024年5月30日周四 23:27写道:

> From: sunyuechi 
>
> Since len < 64, the registers are sufficient, so it can be
> directly unrolled (a4 is even).
>
> Another benefit of unrolling is that it reduces one load operation
> vertically compared to horizontally.
>
>                                    old             new
>                                C908    X60     C908    X60
> vp8_put_bilin4_h_c         :    6.2    5.5 :    6.2    5.5
> vp8_put_bilin4_h_rvv_i32   :    2.2    2.0 :    1.5    1.5
> vp8_put_bilin4_v_c         :    6.5    5.7 :    6.2    5.7
> vp8_put_bilin4_v_rvv_i32   :    2.2    2.0 :    1.2    1.5
> vp8_put_bilin8_h_c         :   24.2   21.5 :   24.2   21.5
> vp8_put_bilin8_h_rvv_i32   :    5.2    4.7 :    3.5    3.5
> vp8_put_bilin8_v_c         :   24.5   21.7 :   24.5   21.7
> vp8_put_bilin8_v_rvv_i32   :    5.2    4.7 :    3.5    3.2
> vp8_put_bilin16_h_c        :   48.0   42.7 :   48.0   42.7
> vp8_put_bilin16_h_rvv_i32  :    5.7    5.0 :    5.2    4.5
> vp8_put_bilin16_v_c        :   48.2   43.0 :   48.2   42.7
> vp8_put_bilin16_v_rvv_i32  :    5.7    5.2 :    4.5    4.2
> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>  1 file changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 3360a38cac..5bea6cba9c 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>  li  t4, 4
>  sub t1, t1, \mn
>  1:
> -addi a4, a4, -1
> -bilin_load  v0, \type, \mn
> -vse8.v  v0, (a0)
> -add a2, a2, a3
> -add a0, a0, a1
> +add t0, a2, a3
> +add t2, a0, a1
> +addi a4, a4, -2
> +.ifc \type,v
> +add t3, t0, a3
> +.else
> +addi t5, a2, 1
> +addi t3, t0, 1
> +vle8.v  v2, (t5)
> +.endif
> +vle8.v  v0, (a2)
> +vle8.v  v4, (t0)
> +vle8.v  v6, (t3)
> +vwmulu.vx   v28, v0, t1
> +vwmulu.vx   v26, v4, t1
> +.ifc \type,v
> +vwmaccu.vx  v28, \mn, v4
> +.else
> +vwmaccu.vx  v28, \mn, v2
> +.endif
> +vwmaccu.vx  v26, \mn, v6
> +vwaddu.wx   v24, v28, t4
> +vwaddu.wx   v22, v26, t4
> +vnsra.wi v30, v24, 3
> +vnsra.wi v0, v22, 3
> +vse8.v  v30, (a0)
> +vse8.v  v0, (t2)
> +add a2, t0, a3
> +add a0, t2, a1
>  bnez a4, 1b
>
>  ret
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-05-30 Thread flow gg
Well.. because scalar registers are limited, the direct unrolling will be
like this for now. We can handle different lengths separately in the future

flow gg  于2024年5月30日周四 23:36写道:

> I directly copied the VP9 modifications over... Since len <= 16, it seems
> like it can be improved a bit more
>
>  于2024年5月30日周四 23:27写道:
>
>> From: sunyuechi 
>>
>> Since len < 64, the registers are sufficient, so it can be
>> directly unrolled (a4 is even).
>>
>> Another benefit of unrolling is that it reduces one load operation
>> vertically compared to horizontally.
>>
>>  old new
>>  C908   X60  C908   X60
>> vp8_put_bilin4_h_c :6.25.5 :6.25.5
>> vp8_put_bilin4_h_rvv_i32   :2.22.0 :1.51.5
>> vp8_put_bilin4_v_c :6.55.7 :6.25.7
>> vp8_put_bilin4_v_rvv_i32   :2.22.0 :1.21.5
>> vp8_put_bilin8_h_c :   24.2   21.5 :   24.2   21.5
>> vp8_put_bilin8_h_rvv_i32   :5.24.7 :3.53.5
>> vp8_put_bilin8_v_c :   24.5   21.7 :   24.5   21.7
>> vp8_put_bilin8_v_rvv_i32   :5.24.7 :3.53.2
>> vp8_put_bilin16_h_c:   48.0   42.7 :   48.0   42.7
>> vp8_put_bilin16_h_rvv_i32  :5.75.0 :5.24.5
>> vp8_put_bilin16_v_c:   48.2   43.0 :   48.2   42.7
>> vp8_put_bilin16_v_rvv_i32  :5.75.2 :4.54.2
>> ---
>>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>>  1 file changed, 29 insertions(+), 5 deletions(-)
>>
>> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
>> index 3360a38cac..5bea6cba9c 100644
>> --- a/libavcodec/riscv/vp8dsp_rvv.S
>> +++ b/libavcodec/riscv/vp8dsp_rvv.S
>> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>>  li  t4, 4
>>  sub t1, t1, \mn
>>  1:
>> -addia4, a4, -1
>> -bilin_load  v0, \type, \mn
>> -vse8.v  v0, (a0)
>> -add a2, a2, a3
>> -add a0, a0, a1
>> +add t0, a2, a3
>> +add t2, a0, a1
>> +addia4, a4, -2
>> +.ifc \type,v
>> +add t3, t0, a3
>> +.else
>> +addit5, a2, 1
>> +addit3, t0, 1
>> +vle8.v  v2, (t5)
>> +.endif
>> +vle8.v  v0, (a2)
>> +vle8.v  v4, (t0)
>> +vle8.v  v6, (t3)
>> +vwmulu.vx   v28, v0, t1
>> +vwmulu.vx   v26, v4, t1
>> +.ifc \type,v
>> +vwmaccu.vx  v28, \mn, v4
>> +.else
>> +vwmaccu.vx  v28, \mn, v2
>> +.endif
>> +vwmaccu.vx  v26, \mn, v6
>> +vwaddu.wx   v24, v28, t4
>> +vwaddu.wx   v22, v26, t4
>> +vnsra.wiv30, v24, 3
>> +vnsra.wiv0, v22, 3
>> +vse8.v  v30, (a0)
>> +vse8.v  v0, (t2)
>> +add a2, t0, a3
>> +add a0, t2, a1
>>  bneza4, 1b
>>
>>  ret
>> --
>> 2.45.1
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v2] lavc/vvc_mc: R-V V avg w_avg

2024-06-01 Thread flow gg
> In keeping in line with the rest of the project, that should probably go
into
> **libavcodec/riscv/vvc/**
> Expanding the macro 49 times, with up to 14 **branches** to get there is
maybe not
> such a great idea. It might look nice on the checkasm µbenchmarks because
the
> branches under test get predicted and cached.
>
> But in real use, branch prediction will not work so well, and the I-cache
will be filled with all variants of the same function.
>
> Indeed, this seems to result in about .5 MiB of code.
>
> Even if only one half is needed (128-bit or 256+-bit variants), that's a
lot.
>
> For comparison, x86 uses just about 10 KiB, also with two variants.
>
> What I make out from the arcane forbidden CISC arts there:
>
> - functions are specialised only in one dimension, not both,
> - dispatch tables avoid multiplying branches.

Referring to x86, the code has been updated. The current code size is 6k,
and a jmp table has been added.

 于2024年6月2日周日 02:01写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.01.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:1.72.0
> avg_8_2x4_rvv_i32  :1.21.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.5
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.2   15.0
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   46.7   44.2
> avg_8_2x64_rvv_i32 :   39.2   36.0
> avg_8_2x128_c  :   99.7   80.0
> avg_8_2x128_rvv_i32:   86.2   65.5
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.7
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.57.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.5   14.0
> avg_8_4x16_rvv_i32 :3.22.7
> avg_8_4x32_c   :   26.2   27.5
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   65.7
> avg_8_4x64_rvv_i32 :   44.0   32.0
> avg_8_4x128_c  :  165.2  118.5
> avg_8_4x128_rvv_i32:   81.5   71.0
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.5
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.5   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.0   52.5
> avg_8_8x32_rvv_i32 :6.75.2
> avg_8_8x64_c   :  120.7  119.0
> avg_8_8x64_rvv_i32 :   43.2   33.5
> avg_8_8x128_c  :  247.5  217.7
> avg_8_8x128_rvv_i32:  100.5   74.7
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.21.0
> avg_8_16x4_c   :   12.2   13.0
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.5   25.7
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   48.7   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.5  102.7
> avg_8_16x32_rvv_i32:   10.76.0
> avg_8_16x64_c  :  213.0  215.0
> avg_8_16x64_rvv_i32:   51.5   33.5
> avg_8_16x128_c :  408.5  417.0
> avg_8_16x128_rvv_i32   :  102.0   71.5
> avg_8_32x2_c   :   12.2   13.0
> av

Re: [FFmpeg-devel] [PATCH v2] lavc/vvc_mc: R-V V avg w_avg

2024-06-01 Thread flow gg
> I think we can drop the 2x2 transforms. In all likelihood, scalar code
will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.

I want to drop 2x2, but since there's only one function to handle all
situations instead of 7*7 functions, how can I drop only 2x2?

Rémi Denis-Courmont  于2024年6月2日周日 03:54写道:

> Le lauantaina 1. kesäkuuta 2024, 21.01.16 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >   C908   X60
> > avg_8_2x2_c:1.01.0
> > avg_8_2x2_rvv_i32  :1.01.0
>
> I think we can drop the 2x2 transforms. In all likelihood, scalar code
> will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.
>
> > avg_8_2x4_c:1.72.0
> > avg_8_2x4_rvv_i32  :1.21.2
> > avg_8_2x8_c:3.74.0
> > avg_8_2x8_rvv_i32  :2.02.0
> > avg_8_2x16_c   :7.27.5
> > avg_8_2x16_rvv_i32 :3.23.0
> > avg_8_2x32_c   :   14.2   15.0
> > avg_8_2x32_rvv_i32 :5.75.0
> > avg_8_2x64_c   :   46.7   44.2
> > avg_8_2x64_rvv_i32 :   39.2   36.0
> > avg_8_2x128_c  :   99.7   80.0
> > avg_8_2x128_rvv_i32:   86.2   65.5
> > avg_8_4x2_c:2.02.0
> > avg_8_4x2_rvv_i32  :1.01.0
> > avg_8_4x4_c:3.53.7
> > avg_8_4x4_rvv_i32  :1.51.2
> > avg_8_4x8_c:6.57.0
> > avg_8_4x8_rvv_i32  :2.01.7
> > avg_8_4x16_c   :   13.5   14.0
> > avg_8_4x16_rvv_i32 :3.22.7
> > avg_8_4x32_c   :   26.2   27.5
> > avg_8_4x32_rvv_i32 :5.75.0
> > avg_8_4x64_c   :   75.0   65.7
> > avg_8_4x64_rvv_i32 :   44.0   32.0
> > avg_8_4x128_c  :  165.2  118.5
> > avg_8_4x128_rvv_i32:   81.5   71.0
> > avg_8_8x2_c:3.23.5
> > avg_8_8x2_rvv_i32  :1.21.0
> > avg_8_8x4_c:6.56.5
> > avg_8_8x4_rvv_i32  :1.51.5
> > avg_8_8x8_c:   12.5   13.2
> > avg_8_8x8_rvv_i32  :2.21.7
> > avg_8_8x16_c   :   25.2   26.5
> > avg_8_8x16_rvv_i32 :3.72.7
> > avg_8_8x32_c   :   50.0   52.5
> > avg_8_8x32_rvv_i32 :6.75.2
> > avg_8_8x64_c   :  120.7  119.0
> > avg_8_8x64_rvv_i32 :   43.2   33.5
> > avg_8_8x128_c  :  247.5  217.7
> > avg_8_8x128_rvv_i32:  100.5   74.7
> > avg_8_16x2_c   :6.26.5
> > avg_8_16x2_rvv_i32 :1.21.0
> > avg_8_16x4_c   :   12.2   13.0
> > avg_8_16x4_rvv_i32 :2.01.2
> > avg_8_16x8_c   :   24.5   25.7
> > avg_8_16x8_rvv_i32 :3.22.0
> > avg_8_16x16_c  :   48.7   51.2
> > avg_8_16x16_rvv_i32:5.73.2
> > avg_8_16x32_c  :   97.5  102.7
> > avg_8_16x32_rvv_i32:   10.76.0
> > avg_8_16x64_c  :  213.0  215.0
> > avg_8_16x64_rvv_i32:   51.5   33.5
> > avg_8_16x128_c :  408.5  417.0
> > avg_8_16x128_rvv_i32   :  102.0   71.5
> > avg_8_32x2_c   :   12.2   13.0
> > avg_8_32x2_rvv_i32 :2.01.2
> > avg_8_32x4_c   :   24.5   25.5
> > avg_8_32x4_rvv_i32 :3.21.7
> > avg_8_32x8_c 

Re: [FFmpeg-devel] [PATCH v3] lavc/vvc_mc: R-V V avg w_avg

2024-06-11 Thread flow gg
> I think we can drop the 2x2 transforms. In all likelihood, scalar code
will
> end up faster than vector code on future hardware, especially out-of-order
> pipelines.

I want to drop 2x2, but since there's only one function to handle all
situations instead of 7*7 functions..

> AFAIU, this will generate relocations. I wonder if the linker is smart
enough to
> put that into .data.relro rather than whine that it can't leave it in
.rodata?
>
> In assembler, we can dodge the problem entirely by storing relative
offsets
> rather than addresses. You can also stick to 4- or even 2-byte values
then.

Okay, updated it in the reply

> LLA is an alias for AUIPC; ADD. You can avoid that ADD by folding the low
bits
> into LD. See how ff_h263_loop_filter_strength is addressed in
h263dsp_rvv.S.

With the previous change to use relative offsets in the table,
it seems that the full table start address needs to be stored in a register
once,
so it appears that this situation requires the use of lla.

 于2024年6月12日周三 00:38写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32:   48.2   39.5
> avg_8_16x128_c :  436.2  428.0
> avg_8_16x128_rvv_i32   :   97.2   77.0
> avg_8_32x2_c  

Re: [FFmpeg-devel] [PATCH v4] lavc/vvc_mc: R-V V avg w_avg

2024-06-11 Thread flow gg
> Nit: for overall code base consistency, I'd use csrwi here. Reason being
that
> for other rounding modes, csrwi is the better option.
>
> Probably faster to swap the two above, to avoid stalling on LD.
>
> If you check more than one length, better to get ff_get_rv_vlenb() into a
local
> variable.
>
> In C, it would be invalid pointer arithmetic, but in assembler, you can
add
> whatever constant offset you want to this symbol, even if it points outside
the
> table. So you should be able to eliminate the LI above. It won't make much
> difference though.

Okay, updated them in the reply

> Could SEW be a parameter so that these three macros would be a little bit
more
> factored? .ifc / .ifnc might help to match e8/e16/e32.

I feel this makes the vset overly complex, and adding more if-else
statements doesn't significantly reduce the amount of code..

> I guess t4 is 32-bit? Kinda sad to switch VTYPE just for this but if so, I
> don't have any better idea :(

Yes, t4 is 32-bit. I've considered this and haven't found a better
solution. :(

> Is that .rept meaningfully faster than a run-time loop?

I haven't done a direct comparison.. it's just to reduce a few comparisons.

 于2024年6月12日周三 02:38写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32  

Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-06-12 Thread flow gg
ping

 于2024年5月30日周四 23:27写道:

> From: sunyuechi 
>
> Since len < 64, the registers are sufficient, so it can be
> directly unrolled (a4 is even).
>
> Another benefit of unrolling is that it reduces one load operation
> vertically compared to horizontally.
>
>  old new
>  C908   X60  C908   X60
> vp8_put_bilin4_h_c :6.25.5 :6.25.5
> vp8_put_bilin4_h_rvv_i32   :2.22.0 :1.51.5
> vp8_put_bilin4_v_c :6.55.7 :6.25.7
> vp8_put_bilin4_v_rvv_i32   :2.22.0 :1.21.5
> vp8_put_bilin8_h_c :   24.2   21.5 :   24.2   21.5
> vp8_put_bilin8_h_rvv_i32   :5.24.7 :3.53.5
> vp8_put_bilin8_v_c :   24.5   21.7 :   24.5   21.7
> vp8_put_bilin8_v_rvv_i32   :5.24.7 :3.53.2
> vp8_put_bilin16_h_c:   48.0   42.7 :   48.0   42.7
> vp8_put_bilin16_h_rvv_i32  :5.75.0 :5.24.5
> vp8_put_bilin16_v_c:   48.2   43.0 :   48.2   42.7
> vp8_put_bilin16_v_rvv_i32  :5.75.2 :4.54.2
> ---
>  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
>  1 file changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 3360a38cac..5bea6cba9c 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
>  li  t4, 4
>  sub t1, t1, \mn
>  1:
> -addia4, a4, -1
> -bilin_load  v0, \type, \mn
> -vse8.v  v0, (a0)
> -add a2, a2, a3
> -add a0, a0, a1
> +add t0, a2, a3
> +add t2, a0, a1
> +addia4, a4, -2
> +.ifc \type,v
> +add t3, t0, a3
> +.else
> +addit5, a2, 1
> +addit3, t0, 1
> +vle8.v  v2, (t5)
> +.endif
> +vle8.v  v0, (a2)
> +vle8.v  v4, (t0)
> +vle8.v  v6, (t3)
> +vwmulu.vx   v28, v0, t1
> +vwmulu.vx   v26, v4, t1
> +.ifc \type,v
> +vwmaccu.vx  v28, \mn, v4
> +.else
> +vwmaccu.vx  v28, \mn, v2
> +.endif
> +vwmaccu.vx  v26, \mn, v6
> +vwaddu.wx   v24, v28, t4
> +vwaddu.wx   v22, v26, t4
> +vnsra.wiv30, v24, 3
> +vnsra.wiv0, v22, 3
> +vse8.v  v30, (a0)
> +vse8.v  v0, (t2)
> +add a2, t0, a3
> +add a0, t2, a1
>  bneza4, 1b
>
>  ret
> --
> 2.45.1
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_bilin_h v unroll

2024-06-12 Thread flow gg
> Does this not render the type parameter of bilin_load useless (always h)?
> (Not a blocker for this patch.)

Yes, this was needed in the initial version, but it is no longer required.
I just sent a patch.

> Not sure if I already asked this but is this really faster than slide1?
> Normally we want to minimise the work of the memory bus.

Originally it was slide, but based on your review, it was changed to load,
which should be better.

review: "Can't we skip the slide and just load the vector at a2+1? Also
then, we can keep VL=len and halve the multiplier."

Rémi Denis-Courmont  于2024年6月12日周三 22:41写道:

> Le torstaina 30. toukokuuta 2024, 18.26.53 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > Since len < 64, the registers are sufficient, so it can be
> > directly unrolled (a4 is even).
> >
> > Another benefit of unrolling is that it reduces one load operation
> > vertically compared to horizontally.
> >
> >  old new
> >  C908   X60  C908   X60
> > vp8_put_bilin4_h_c :6.25.5 :6.25.5
> > vp8_put_bilin4_h_rvv_i32   :2.22.0 :1.51.5
> > vp8_put_bilin4_v_c :6.55.7 :6.25.7
> > vp8_put_bilin4_v_rvv_i32   :2.22.0 :1.21.5
> > vp8_put_bilin8_h_c :   24.2   21.5 :   24.2   21.5
> > vp8_put_bilin8_h_rvv_i32   :5.24.7 :3.53.5
> > vp8_put_bilin8_v_c :   24.5   21.7 :   24.5   21.7
> > vp8_put_bilin8_v_rvv_i32   :5.24.7 :3.53.2
> > vp8_put_bilin16_h_c:   48.0   42.7 :   48.0   42.7
> > vp8_put_bilin16_h_rvv_i32  :5.75.0 :5.24.5
> > vp8_put_bilin16_v_c:   48.2   43.0 :   48.2   42.7
> > vp8_put_bilin16_v_rvv_i32  :5.75.2 :4.54.2
> > ---
> >  libavcodec/riscv/vp8dsp_rvv.S | 34 +-
> >  1 file changed, 29 insertions(+), 5 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index 3360a38cac..5bea6cba9c 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -172,11 +172,35 @@ func ff_put_vp8_bilin4_\type\()_rvv, zve32x
> >  li  t4, 4
> >  sub t1, t1, \mn
> >  1:
> > -addia4, a4, -1
> > -bilin_load  v0, \type, \mn
>
> Does this not render the type parameter of bilin_load useless (always h)?
> (Not a blocker for this patch.)
>
> > -vse8.v  v0, (a0)
> > -add a2, a2, a3
> > -add a0, a0, a1
> > +add t0, a2, a3
> > +add t2, a0, a1
> > +addia4, a4, -2
> > +.ifc \type,v
> > +add t3, t0, a3
> > +.else
> > +addit5, a2, 1
> > +addit3, t0, 1
> > +vle8.v  v2, (t5)
>
> Not sure if I already asked this but is this really faster than slide1?
> Normally we want to minimise the work of the memory bus.
>
> > +.endif
> > +vle8.v  v0, (a2)
> > +vle8.v  v4, (t0)
> > +vle8.v  v6, (t3)
> > +vwmulu.vx   v28, v0, t1
> > +vwmulu.vx   v26, v4, t1
> > +.ifc \type,v
> > +vwmaccu.vx  v28, \mn, v4
> > +.else
> > +vwmaccu.vx  v28, \mn, v2
> > +.endif
> > +vwmaccu.vx  v26, \mn, v6
> > +vwaddu.wx   v24, v28, t4
> > +vwaddu.wx   v22, v26, t4
> > +vnsra.wiv30, v24, 3
> > +vnsra.wiv0, v22, 3
> > +vse8.v  v30, (a0)
> > +vse8.v  v0, (t2)
> > +add a2, t0, a3
> > +add a0, t2, a1
> >  bneza4, 1b
> >
> >  ret
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH v4 1/4] lavc/vp9dsp: R-V V mc bilin h v

2024-06-15 Thread flow gg
Just like in VP8, the unroll has been updated.

 于2024年6月15日周六 19:51写道:

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4h_8bpp_c:5.54.7
> vp9_avg_bilin_4h_8bpp_rvv_i32  :1.71.5
> vp9_avg_bilin_4v_8bpp_c:5.54.7
> vp9_avg_bilin_4v_8bpp_rvv_i32  :1.51.2
> vp9_avg_bilin_8h_8bpp_c:   20.0   17.7
> vp9_avg_bilin_8h_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_8v_8bpp_c:   20.7   18.7
> vp9_avg_bilin_8v_8bpp_rvv_i32  :3.02.7
> vp9_avg_bilin_16h_8bpp_c   :   78.2   69.7
> vp9_avg_bilin_16h_8bpp_rvv_i32 :7.06.2
> vp9_avg_bilin_16v_8bpp_c   :   98.5   73.2
> vp9_avg_bilin_16v_8bpp_rvv_i32 :7.06.0
> vp9_avg_bilin_32h_8bpp_c   :  325.5  275.5
> vp9_avg_bilin_32h_8bpp_rvv_i32 :   23.0   20.5
> vp9_avg_bilin_32v_8bpp_c   :  342.2  290.0
> vp9_avg_bilin_32v_8bpp_rvv_i32 :   21.7   19.5
> vp9_avg_bilin_64h_8bpp_c   : 1263.7 1095.7
> vp9_avg_bilin_64h_8bpp_rvv_i32 :   91.2   81.2
> vp9_avg_bilin_64v_8bpp_c   : 1331.7 1155.2
> vp9_avg_bilin_64v_8bpp_rvv_i32 :   91.2   81.0
> vp9_put_bilin_4h_8bpp_c:4.54.0
> vp9_put_bilin_4h_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_4v_8bpp_c:4.74.2
> vp9_put_bilin_4v_8bpp_rvv_i32  :1.01.0
> vp9_put_bilin_8h_8bpp_c:   16.7   15.0
> vp9_put_bilin_8h_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_8v_8bpp_c:   17.5   15.7
> vp9_put_bilin_8v_8bpp_rvv_i32  :2.22.0
> vp9_put_bilin_16h_8bpp_c   :   65.2   58.0
> vp9_put_bilin_16h_8bpp_rvv_i32 :6.05.5
> vp9_put_bilin_16v_8bpp_c   :   69.2   61.7
> vp9_put_bilin_16v_8bpp_rvv_i32 :5.75.2
> vp9_put_bilin_32h_8bpp_c   :  273.2  229.0
> vp9_put_bilin_32h_8bpp_rvv_i32 :   19.7   17.7
> vp9_put_bilin_32v_8bpp_c   :  290.5  243.7
> vp9_put_bilin_32v_8bpp_rvv_i32 :   18.7   16.7
> vp9_put_bilin_64h_8bpp_c   : 1040.5  910.5
> vp9_put_bilin_64h_8bpp_rvv_i32 :   82.5   73.0
> vp9_put_bilin_64v_8bpp_c   : 1108.5  971.0
> vp9_put_bilin_64v_8bpp_rvv_i32 :   82.2   73.2
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 114 +
>  libavcodec/riscv/vp9dsp.h  |  12 ++--
>  libavcodec/riscv/vp9dsp_init.c |  21 ++
>  3 files changed, 141 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 7cb38ec94a..fb7377048a 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -53,6 +53,120 @@ func ff_vp9_avg\len\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_load_h dst, op, mn
> +addit5, a2, 1
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv \dst, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   \dst, \dst, v16
> +.endif
> +.endm
> +
> +.macro bilin_h_v op, type, mn
> +func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> +vsetvlstatic8   64, t0, 64
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn
> +1:
> +addia4, a4, -1
> +.ifc \type,v
> +add t5, a2, a3
> +.else
> +addit5, a2, 1
> +.endif
> +vle8.v  v8, (a2)
> +vle8.v  v0, (t5)
> +vwmulu.vx   v16, v0, \mn
> +vwmaccsu.vx v16, t1, v8
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv v0, v16, v8
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +ret
> +
> +.Lbilin_\type\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +li  t4, 8
> +neg t1, \mn
> +1:
> +addi   

Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv

2024-06-15 Thread flow gg
> Copying vectors is rarely justified - mostly only before destructive
> instructions such as FMA.

It is slightly different from VP8. In VP8, many scalar values are positive,
so the related calculations can be easily replaced. However, in this
context of VP9, since t2 is a negative number, vwmaccsu is required.
Therefore, unlike the logic in VP8, we cannot use vwmulu.vx before
bilin_load to avoid vmv.


 于2024年6月15日周六 19:51写道:

> From: sunyuechi 
>
>  C908   X60
> vp9_avg_bilin_4hv_8bpp_c   :   10.79.5
> vp9_avg_bilin_4hv_8bpp_rvv_i32 :4.03.5
> vp9_avg_bilin_8hv_8bpp_c   :   38.5   34.2
> vp9_avg_bilin_8hv_8bpp_rvv_i32 :7.26.5
> vp9_avg_bilin_16hv_8bpp_c  :  147.2  130.5
> vp9_avg_bilin_16hv_8bpp_rvv_i32:   14.5   12.7
> vp9_avg_bilin_32hv_8bpp_c  :  574.2  509.7
> vp9_avg_bilin_32hv_8bpp_rvv_i32:   42.5   38.0
> vp9_avg_bilin_64hv_8bpp_c  : 2321.2 2017.7
> vp9_avg_bilin_64hv_8bpp_rvv_i32:  163.5  131.0
> vp9_put_bilin_4hv_8bpp_c   :   10.08.7
> vp9_put_bilin_4hv_8bpp_rvv_i32 :3.53.0
> vp9_put_bilin_8hv_8bpp_c   :   35.2   31.2
> vp9_put_bilin_8hv_8bpp_rvv_i32 :6.55.7
> vp9_put_bilin_16hv_8bpp_c  :  134.0  119.0
> vp9_put_bilin_16hv_8bpp_rvv_i32:   12.7   11.5
> vp9_put_bilin_32hv_8bpp_c  :  538.5  464.2
> vp9_put_bilin_32hv_8bpp_rvv_i32:   39.7   35.2
> vp9_put_bilin_64hv_8bpp_c  : 2111.7 1833.2
> vp9_put_bilin_64hv_8bpp_rvv_i32:  138.5  122.5
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 38 +-
>  libavcodec/riscv/vp9dsp_init.c | 10 +
>  2 files changed, 47 insertions(+), 1 deletion(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index fb7377048a..5241562531 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
>  endfunc
>  .endm
>
> +.macro bilin_hv op
> +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> +vsetvlstatic8   64, t0, 64
> +.Lbilin_hv\op:
> +.ifc \op,avg
> +csrwi   vxrm, 0
> +.endif
> +neg t1, a5
> +neg t2, a6
> +li  t4, 8
> +bilin_load_hv24, put, a5
> +add a2, a2, a3
> +1:
> +addia4, a4, -1
> +bilin_load_hv4, put, a5
> +vwmulu.vx   v16, v4, a6
> +vwmaccsu.vx v16, t2, v24
> +vwadd.wxv16, v16, t4
> +vnsra.wiv16, v16, 4
> +vadd.vv v0, v16, v24
> +.ifc \op,avg
> +vle8.v  v16, (a0)
> +vaaddu.vv   v0, v0, v16
> +.endif
> +vse8.v  v0, (a0)
> +vmv.v.v v24, v4
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
>  .irp len, 64, 32, 16, 8, 4
>  copy_avg \len
>  .endr
> @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
>  bilin_h_v  avg, h, a5
>  bilin_h_v  put, v, a6
>  bilin_h_v  avg, v, a6
> +bilin_hv   put
> +bilin_hv   avg
>
>  .macro func_bilin_h_v len, op, type
>  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> @@ -165,7 +201,7 @@ endfunc
>
>  .irp len, 32, 16, 8, 4
>  .irp op, put, avg
> -.irp type, h, v
> +.irp type, h, v, hv
>  func_bilin_h_v \len, \op, \type
>  .endr
>  .endr
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 9606d8545f..b3700dfb08 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> *dsp, int bpp)
>  dsp->mc[4][FILTER_BILINEAR ][0][1][0] = ff_put_vp9_bilin_4h_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][0][1] = ff_avg_vp9_bilin_4v_rvv;
>  dsp->mc[4][FILTER_BILINEAR ][1][1][0] = ff_avg_vp9_bilin_4h_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_64hv_rvv;
> +dsp->mc[0][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_64hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_32hv_rvv;
> +dsp->mc[1][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_32hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_16hv_rvv;
> +dsp->mc[2][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bilin_16hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][0][1][1] = ff_put_vp9_bilin_8hv_rvv;
> +dsp->mc[3][FILTER_BILINEAR ][1][1][1] = ff_avg_vp9_bil

Re: [FFmpeg-devel] [PATCH v4 3/4] lavc/vp9dsp: R-V V mc tap h v

2024-06-15 Thread flow gg
> You can directly LLA filters + 16 * 8 * 2 and save one add. Same below.
> You can also use .equ to alias the filter addresses, and avoid if's.

> That's a lot of address dependencies, which is going to hurt performance.
> It might help to just spill more S registers if needed.

> This can be done in 3 instructions, even without mul. Of course you'll
> again need a spare register.

Okay, updated them

> Use a macro parameter for the stride register.

Doing this will reduce one if-else statement in this patch, but in the next
patch, it will lead to adding multiple if-else statements. I think we can
leave it unchanged.

 于2024年6月15日周六 19:51写道:

> From: sunyuechi 
>
>                                          C908    X60
> vp9_avg_8tap_smooth_4h_8bpp_c       :    12.7   11.2
> vp9_avg_8tap_smooth_4h_8bpp_rvv_i32 :     4.7    4.2
> vp9_avg_8tap_smooth_4v_8bpp_c       :    29.7   12.5
> vp9_avg_8tap_smooth_4v_8bpp_rvv_i32 :     4.7    4.2
> vp9_avg_8tap_smooth_8h_8bpp_c       :    48.7   42.2
> vp9_avg_8tap_smooth_8h_8bpp_rvv_i32 :     9.5    8.5
> vp9_avg_8tap_smooth_8v_8bpp_c       :    49.7   45.5
> vp9_avg_8tap_smooth_8v_8bpp_rvv_i32 :     9.5    8.5
> vp9_avg_8tap_smooth_16h_8bpp_c      :   192.0  166.5
> vp9_avg_8tap_smooth_16h_8bpp_rvv_i32:    21.7   19.5
> vp9_avg_8tap_smooth_16v_8bpp_c      :   191.2  175.2
> vp9_avg_8tap_smooth_16v_8bpp_rvv_i32:    21.2   19.0
> vp9_avg_8tap_smooth_32h_8bpp_c      :   780.2  663.2
> vp9_avg_8tap_smooth_32h_8bpp_rvv_i32:    68.2   60.5
> vp9_avg_8tap_smooth_32v_8bpp_c      :   770.0  685.7
> vp9_avg_8tap_smooth_32v_8bpp_rvv_i32:    67.0   59.5
> vp9_avg_8tap_smooth_64h_8bpp_c      :  3116.2 2648.2
> vp9_avg_8tap_smooth_64h_8bpp_rvv_i32:   270.7  120.7
> vp9_avg_8tap_smooth_64v_8bpp_c      :  3058.5 2731.7
> vp9_avg_8tap_smooth_64v_8bpp_rvv_i32:   266.5  119.0
> vp9_put_8tap_smooth_4h_8bpp_c       :    11.0    9.7
> vp9_put_8tap_smooth_4h_8bpp_rvv_i32 :     4.2    3.7
> vp9_put_8tap_smooth_4v_8bpp_c       :    11.7   10.5
> vp9_put_8tap_smooth_4v_8bpp_rvv_i32 :     4.0    3.7
> vp9_put_8tap_smooth_8h_8bpp_c       :    42.0   37.5
> vp9_put_8tap_smooth_8h_8bpp_rvv_i32 :     8.5    7.7
> vp9_put_8tap_smooth_8v_8bpp_c       :    43.5   38.5
> vp9_put_8tap_smooth_8v_8bpp_rvv_i32 :     8.7    7.7
> vp9_put_8tap_smooth_16h_8bpp_c      :   181.7  147.2
> vp9_put_8tap_smooth_16h_8bpp_rvv_i32:    20.0   18.0
> vp9_put_8tap_smooth_16v_8bpp_c      :   168.5  149.7
> vp9_put_8tap_smooth_16v_8bpp_rvv_i32:    19.7   17.5
> vp9_put_8tap_smooth_32h_8bpp_c      :   675.0  586.5
> vp9_put_8tap_smooth_32h_8bpp_rvv_i32:    65.2   58.0
> vp9_put_8tap_smooth_32v_8bpp_c      :   664.7  591.2
> vp9_put_8tap_smooth_32v_8bpp_rvv_i32:    64.0   57.0
> vp9_put_8tap_smooth_64h_8bpp_c      :  2696.2 2339.0
> vp9_put_8tap_smooth_64h_8bpp_rvv_i32:   259.7  115.7
> vp9_put_8tap_smooth_64v_8bpp_c      :  2691.0 2348.5
> vp9_put_8tap_smooth_64v_8bpp_rvv_i32:   255.5  114.0
> ---
>  libavcodec/riscv/vp9_mc_rvv.S  | 200 +
>  libavcodec/riscv/vp9dsp.h  |  72 
>  libavcodec/riscv/vp9dsp_init.c |  38 ++-
>  3 files changed, 285 insertions(+), 25 deletions(-)
>
> diff --git a/libavcodec/riscv/vp9_mc_rvv.S b/libavcodec/riscv/vp9_mc_rvv.S
> index 5241562531..5e81301aa5 100644
> --- a/libavcodec/riscv/vp9_mc_rvv.S
> +++ b/libavcodec/riscv/vp9_mc_rvv.S
> @@ -36,6 +36,18 @@
>  .endif
>  .endm
>
> +.macro vsetvlstatic16 len
> +.ifc \len,4
> +vsetvli zero, zero, e16, mf2, ta, ma
> +.elseif \len == 8
> +vsetvli zero, zero, e16, m1, ta, ma
> +.elseif \len == 16
> +vsetvli zero, zero, e16, m2, ta, ma
> +.else
> +vsetvli zero, zero, e16, m4, ta, ma
> +.endif
> +.endm
> +
>  .macro copy_avg len
>  func ff_vp9_avg\len\()_rvv, zve32x
>  csrwi   vxrm, 0
> @@ -181,8 +193,196 @@ func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
>  endfunc
>  .endm
>
> +.equ ff_vp9_subpel_filters_smooth, ff_vp9_subpel_filters
> +.equ ff_vp9_subpel_filters_regular, ff_vp9_subpel_filters + 16*8*2
> +.equ ff_vp9_subpel_filters_sharp, ff_vp9_subpel_filters + 16*8*2*2
> +
> +.macro epel_filter name, type, regtype
> +lla \regtype\()2, ff_vp9_subpel_filters_\name
> +
> +.ifc \type,v
> +slli\regtype\()0, a6, 4
> +.else
> +slli\regtype\()0, a5, 4
> +.endif
> +add \regtype\()0, \regtype\()0, \r

Re: [FFmpeg-devel] [PATCH v4 2/4] lavc/vp9dsp: R-V V mc bilin hv

2024-06-30 Thread flow gg
Initially, I tried using `vnclip.wi` with reference to h264,
-vwadd.wxv16, v16, t4
-vnsra.wiv16, v16, 4
+vnclip.wi   v16, v16, 4

but couldn't find the correct way... I think there might be some overflow
issues that I didn't understand correctly. How do you think it should be
replaced?

Rémi Denis-Courmont  于2024年6月25日周二 04:07写道:

> Le lauantaina 15. kesäkuuta 2024, 14.50.32 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> >                                     C908    X60
> > vp9_avg_bilin_4hv_8bpp_c       :    10.7    9.5
> > vp9_avg_bilin_4hv_8bpp_rvv_i32 :     4.0    3.5
> > vp9_avg_bilin_8hv_8bpp_c       :    38.5   34.2
> > vp9_avg_bilin_8hv_8bpp_rvv_i32 :     7.2    6.5
> > vp9_avg_bilin_16hv_8bpp_c      :   147.2  130.5
> > vp9_avg_bilin_16hv_8bpp_rvv_i32:    14.5   12.7
> > vp9_avg_bilin_32hv_8bpp_c      :   574.2  509.7
> > vp9_avg_bilin_32hv_8bpp_rvv_i32:    42.5   38.0
> > vp9_avg_bilin_64hv_8bpp_c      :  2321.2 2017.7
> > vp9_avg_bilin_64hv_8bpp_rvv_i32:   163.5  131.0
> > vp9_put_bilin_4hv_8bpp_c       :    10.0    8.7
> > vp9_put_bilin_4hv_8bpp_rvv_i32 :     3.5    3.0
> > vp9_put_bilin_8hv_8bpp_c       :    35.2   31.2
> > vp9_put_bilin_8hv_8bpp_rvv_i32 :     6.5    5.7
> > vp9_put_bilin_16hv_8bpp_c      :   134.0  119.0
> > vp9_put_bilin_16hv_8bpp_rvv_i32:    12.7   11.5
> > vp9_put_bilin_32hv_8bpp_c      :   538.5  464.2
> > vp9_put_bilin_32hv_8bpp_rvv_i32:    39.7   35.2
> > vp9_put_bilin_64hv_8bpp_c      :  2111.7 1833.2
> > vp9_put_bilin_64hv_8bpp_rvv_i32:   138.5  122.5
> > ---
> >  libavcodec/riscv/vp9_mc_rvv.S  | 38 +-
> >  libavcodec/riscv/vp9dsp_init.c | 10 +
> >  2 files changed, 47 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavcodec/riscv/vp9_mc_rvv.S
> b/libavcodec/riscv/vp9_mc_rvv.S
> > index fb7377048a..5241562531 100644
> > --- a/libavcodec/riscv/vp9_mc_rvv.S
> > +++ b/libavcodec/riscv/vp9_mc_rvv.S
> > @@ -147,6 +147,40 @@ func ff_\op\()_vp9_bilin_64\type\()_rvv, zve32x
> >  endfunc
> >  .endm
> >
> > +.macro bilin_hv op
> > +func ff_\op\()_vp9_bilin_64hv_rvv, zve32x
> > +vsetvlstatic8   64, t0, 64
> > +.Lbilin_hv\op:
> > +.ifc \op,avg
> > +csrwi   vxrm, 0
> > +.endif
> > +neg t1, a5
> > +neg t2, a6
> > +li  t4, 8
> > +bilin_load_hv24, put, a5
> > +add a2, a2, a3
> > +1:
> > +addia4, a4, -1
> > +bilin_load_hv4, put, a5
> > +vwmulu.vx   v16, v4, a6
> > +vwmaccsu.vx v16, t2, v24
> > +vwadd.wxv16, v16, t4
> > +vnsra.wiv16, v16, 4
>
> Why round manually?
> It looks like vnclip.wi would be more straightforward here.
>
> > +vadd.vv v0, v16, v24
> > +.ifc \op,avg
> > +vle8.v  v16, (a0)
> > +vaaddu.vv   v0, v0, v16
> > +.endif
> > +vse8.v  v0, (a0)
> > +vmv.v.v v24, v4
> > +add a2, a2, a3
> > +add a0, a0, a1
> > +bneza4, 1b
> > +
> > +ret
> > +endfunc
> > +.endm
> > +
> >  .irp len, 64, 32, 16, 8, 4
> >  copy_avg \len
> >  .endr
> > @@ -155,6 +189,8 @@ bilin_h_v  put, h, a5
> >  bilin_h_v  avg, h, a5
> >  bilin_h_v  put, v, a6
> >  bilin_h_v  avg, v, a6
> > +bilin_hv   put
> > +bilin_hv   avg
> >
> >  .macro func_bilin_h_v len, op, type
> >  func ff_\op\()_vp9_bilin_\len\()\type\()_rvv, zve32x
> > @@ -165,7 +201,7 @@ endfunc
> >
> >  .irp len, 32, 16, 8, 4
> >  .irp op, put, avg
> > -.irp type, h, v
> > +.irp type, h, v, hv
> >  func_bilin_h_v \len, \op, \type
> >  .endr
> >  .endr
> > diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> > index 9606d8545f..b3700dfb08 100644
> > --- a/libavcodec/riscv/vp9dsp_init.c
> > +++ b/libavcodec/riscv/vp9dsp_init.c
> > @@ -83,6 +83,16 @@ static av_cold void vp9dsp_mc_init_riscv(VP9DSPContext
> > *dsp, int bpp) dsp->mc[4][FILTER_BILINEAR ][0][1][0] =
> > ff_put_vp9_bilin_4h_rvv; dsp->mc[4][FILTER_BILINEAR ][1][0][1] =
> > ff_avg_vp9_bilin_4v_rvv; dsp->mc[4][FILTER_BILINEAR ][1][1][0] =
> > ff_avg_vp9_bilin_4h_rvv; +dsp->mc[0][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_64hv_rvv; +dsp->mc[0][FILTER_BILINEAR ][1][1][1] =
> > ff_avg_vp9_bilin_64hv_rvv; +dsp->mc[1][FILTER_BILINEAR ][0][1][1] =
> > ff_put_vp9_bilin_32hv_rvv; +dsp->mc[1][FILTER_BILINEAR ][1][1][1] =
> > ff_av

Re: [FFmpeg-devel] [PATCH 2/2] lavc/h264dsp: R-V V 8-bit luma loop filter

2024-07-01 Thread flow gg
The horizontal loop filter in VP8 also has this issue.

Rémi Denis-Courmont  于2024年6月30日周日 17:04写道:

> T-Head C908 (cycles):
> h264_h_loop_filter_luma_8bpp_c:   297.5
> h264_h_loop_filter_luma_8bpp_rvv_i32: 374.7
> h264_v_loop_filter_luma_8bpp_c:   862.7
> h264_v_loop_filter_luma_8bpp_rvv_i32: 200.7
>
> Performance in the horizontal scenario seems worse than scalar. x86
> SSE2 and AVX optimisations are similarly affected. This is presumably
> caused by unlucky inputs from checkasm, such that the C code
> short-circuits almost all filter calculations.
> ---
>  libavcodec/riscv/Makefile   |   1 +
>  libavcodec/riscv/h264dsp_init.c |  13 ++-
>  libavcodec/riscv/h264dsp_rvv.S  | 136 
>  3 files changed, 149 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/h264dsp_rvv.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index c180223141..a1510e8c6e 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -31,6 +31,7 @@ RVV-OBJS-$(CONFIG_H263DSP) += riscv/h263dsp_rvv.o
>  OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o
>  RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o
>  OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o
> +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_rvv.o
>  OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
>  RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
>  OBJS-$(CONFIG_IDCTDSP) += riscv/idctdsp_init.o
> diff --git a/libavcodec/riscv/h264dsp_init.c
> b/libavcodec/riscv/h264dsp_init.c
> index dbbf3db400..0d4d541992 100644
> --- a/libavcodec/riscv/h264dsp_init.c
> +++ b/libavcodec/riscv/h264dsp_init.c
> @@ -24,8 +24,14 @@
>
>  #include "libavutil/attributes.h"
>  #include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
>  #include "libavcodec/h264dsp.h"
>
> +void ff_h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
> +  int alpha, int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride,
> +  int alpha, int beta, int8_t *tc0);
> +
>  extern int ff_startcode_find_candidate_rvb(const uint8_t *, int);
>  extern int ff_startcode_find_candidate_rvv(const uint8_t *, int);
>
> @@ -38,8 +44,13 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp,
> const int bit_depth,
>  if (flags & AV_CPU_FLAG_RVB_BASIC)
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvb;
>  # if HAVE_RVV
> -if (flags & AV_CPU_FLAG_RVV_I32)
> +if (flags & AV_CPU_FLAG_RVV_I32) {
> +if (bit_depth == 8 && ff_rv_vlen_least(128)) {
> +dsp->h264_v_loop_filter_luma =
> ff_h264_v_loop_filter_luma_8_rvv;
> +dsp->h264_h_loop_filter_luma =
> ff_h264_h_loop_filter_luma_8_rvv;
> +}
>  dsp->startcode_find_candidate = ff_startcode_find_candidate_rvv;
> +}
>  # endif
>  #endif
>  }
> diff --git a/libavcodec/riscv/h264dsp_rvv.S
> b/libavcodec/riscv/h264dsp_rvv.S
> new file mode 100644
> index 00..ea9dfb1a7e
> --- /dev/null
> +++ b/libavcodec/riscv/h264dsp_rvv.S
> @@ -0,0 +1,136 @@
> +/*
> + * Copyright © 2024 Rémi Denis-Courmont.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are
> met:
> + *
> + * 1. Redistributions of source code must retain the above copyright
> notice,
> + *this list of conditions and the following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above copyright
> notice,
> + *this list of conditions and the following disclaimer in the
> documentation
> + *and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> "AS IS"
> + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
> THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
> PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
> BE
> + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
> + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> BUSINESS
> + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
> + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
> + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
> THE
> + * POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.variant_cc ff_h264_loop_filter_luma_8_rvv
> +func ff_h264_loop_filter_luma_8_rvv, zve32x
> +# p2: v8, p1: v9, p0: v10, q0: v11, q1: v12, q2: v13
> +# alpha: a2, beta: a3
> +csrwivxrm, 0
> +vid.vv0
> +vaaddu.vvv14, v10, v11 # (p0 + q0 + 1) / 2
> +vsrl.vi 

Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg

2024-07-01 Thread flow gg
> I am not sure what is_w means or serves here. If you need special cases,
> this feels a bit out of place for this macro.

It is a special case added to merge the vset of avg and w_avg. How about
giving it a default value so that it doesn't affect the use of other
functions?

> I am not sure if I get it, but it seems like this could be a normal vector
> processing loop without specialisation for the vector length, at (almost)
> no performance cost.
>
> Can we use a regular loop here instead of repeating the same code?

Okay, updated it

> t0 is a link register, so the branch predictor will treat this as a return,
> but it seems to be a tail call instead.

Will this cause any issues? It will execute at a label, and after
executing, there is a ret at the label.

> For named labels, it is preferable to use func (perhaps with
> .variant_cc), to get all properties right.

This macro is used in func. I assume it already has the properties of func?

> Could t4 be added in 16-bit mode so we don't need to switch vtype?
> (Also same below)

No, it is 32-bit :(

 于2024年7月1日周一 21:39写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32:   48.2   39.5
> avg_8_16x128_c :  436.2  428.0
> avg_8_16x128_rvv_i32

Re: [FFmpeg-devel] [PATCH v5] lavc/vvc_mc: R-V V avg w_avg

2024-07-01 Thread flow gg
I reviewed it again, the purpose of is_w is to limit lmul to a maximum of
1/4 of vlen, to prevent vector register shortage, which can also be
considered as vset limiting lmul. I renamed it to quarter_len_limit.

t0 is changed to t1.

 于2024年7月2日周二 00:07写道:

> From: sunyuechi 
>
>   C908   X60
> avg_8_2x2_c:1.21.0
> avg_8_2x2_rvv_i32  :1.01.0
> avg_8_2x4_c:2.02.0
> avg_8_2x4_rvv_i32  :1.51.2
> avg_8_2x8_c:3.74.0
> avg_8_2x8_rvv_i32  :2.02.0
> avg_8_2x16_c   :7.27.7
> avg_8_2x16_rvv_i32 :3.23.0
> avg_8_2x32_c   :   14.5   15.2
> avg_8_2x32_rvv_i32 :5.75.0
> avg_8_2x64_c   :   50.0   45.2
> avg_8_2x64_rvv_i32 :   41.5   32.5
> avg_8_2x128_c  :  101.5   84.2
> avg_8_2x128_rvv_i32:   89.5   73.2
> avg_8_4x2_c:2.02.0
> avg_8_4x2_rvv_i32  :1.01.0
> avg_8_4x4_c:3.53.5
> avg_8_4x4_rvv_i32  :1.51.2
> avg_8_4x8_c:6.77.0
> avg_8_4x8_rvv_i32  :2.01.7
> avg_8_4x16_c   :   13.2   14.0
> avg_8_4x16_rvv_i32 :3.23.0
> avg_8_4x32_c   :   26.2   27.7
> avg_8_4x32_rvv_i32 :5.75.0
> avg_8_4x64_c   :   75.0   66.0
> avg_8_4x64_rvv_i32 :   40.2   33.0
> avg_8_4x128_c  :  144.5  128.0
> avg_8_4x128_rvv_i32:   89.5   78.7
> avg_8_8x2_c:3.23.5
> avg_8_8x2_rvv_i32  :1.21.0
> avg_8_8x4_c:6.56.7
> avg_8_8x4_rvv_i32  :1.51.5
> avg_8_8x8_c:   12.7   13.2
> avg_8_8x8_rvv_i32  :2.21.7
> avg_8_8x16_c   :   25.2   26.5
> avg_8_8x16_rvv_i32 :3.72.7
> avg_8_8x32_c   :   50.2   52.7
> avg_8_8x32_rvv_i32 :6.55.0
> avg_8_8x64_c   :  120.2  117.7
> avg_8_8x64_rvv_i32 :   45.2   39.2
> avg_8_8x128_c  :  223.0  233.5
> avg_8_8x128_rvv_i32:   80.0   73.2
> avg_8_16x2_c   :6.26.5
> avg_8_16x2_rvv_i32 :1.51.0
> avg_8_16x4_c   :   12.5   12.7
> avg_8_16x4_rvv_i32 :2.01.2
> avg_8_16x8_c   :   24.7   26.0
> avg_8_16x8_rvv_i32 :3.22.0
> avg_8_16x16_c  :   49.0   51.2
> avg_8_16x16_rvv_i32:5.73.2
> avg_8_16x32_c  :   97.7  102.5
> avg_8_16x32_rvv_i32:   10.75.7
> avg_8_16x64_c  :  220.5  214.2
> avg_8_16x64_rvv_i32:   48.2   39.5
> avg_8_16x128_c :  436.2  428.0
> avg_8_16x128_rvv_i32   :   97.2   77.0
> avg_8_32x2_c   :   12.2   12.7
> avg_8_32x2_rvv_i32 :2.01.2
> avg_8_32x4_c   :   24.5   25.5
> avg_8_32x4_rvv_i32 :3.21.7
> avg_8_32x8_c   :   48.5   50.7
> avg_8_32x8_rvv_i32 :5.72.7
> avg_8_32x16_c  :   96.5  101.2
> avg_8_32x16_rvv_i32:   10.25.0
> avg_8_32x32_c  :  192.5  202.2
> avg_8_32x32_rvv_i32:   20.09.5
> avg_8_32x64_c  :  405.7  404.5
> avg_8_32x64_rvv_i32 

Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add

2023-11-13 Thread flow gg
Sorry for the long delay in responding.

How is the modified patch now?

It no longer uses register stride (learned from your code) and has switched
to shNadd instead.

(using m4 and m2 as they are slightly faster than m8 and m4)

benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652

Rémi Denis-Courmont  于2023年9月28日周四 21:33写道:

>
>
> Le 28 septembre 2023 08:45:44 GMT+03:00, flow gg  a
> écrit :
> >Okay, I revert the volatile in ff_read_time
> >
> >How about this version?
>
> It's still using register stride which is all but guaranteed to be slow on
> any hardware and should only be used as a last resort.
>
> The code is also missing scheduling for multi-issue and unrolling with the
> group multiplier.
>
> And lastly, while that probably won't change much, there are no reasons to
> use mul here. You can use shNadd like existing code does.
>
>
> >
> >use vls instead vlseg, and use vfmacc
> >
> >The benchmark is sometimes better, sometimes the same
> >
> >fcmul_add_c: 3.5
> >fcmul_add_rvv_f32: 3.5
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.2
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.5
> >fcmul_add_rvv_f32: 4.2
> > - af_afir.fcmul_add [OK]
> >fcmul_add_c: 4.7
> >fcmul_add_rvv_f32: 3.5
> >
> >
> >Rémi Denis-Courmont  于2023年9月28日周四 00:41写道:
> >
> >> Le tiistaina 26. syyskuuta 2023, 12.24.58 EEST flow gg a écrit :
> >> > benchmark:
> >> > fcmul_add_c: 19.7
> >> > fcmul_add_rvv_f32: 6.7
> >>
> >> With optimisations enabled and the benchmarking fix, I get this (on the
> >> same
> >> hardware, I believe):
> >>
> >> fcmul_add_c: 3.5
> >> fcmul_add_rvv_f32: 6.7
> >>
> >> For sure unfortunate design limitations of T-Head C910 are to blame to
> >> no small extent. It is not the first occurrence of an RVV optimisation
> >> that turns out worse than scalar due to those, and I still have honest
> >> hopes that newer (and conformant) IP would give saner results, but... I
> >> also believe that the code could be improved regardless.
> >>
> >> --
> >> Rémi Denis-Courmont
> >> http://www.remlab.net/
> >>
> >>
> >>
> >> ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 4199887247d31348385cd864b4efd6f4c02740f2 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

benchmark:
fcmul_add_c: 2179
fcmul_add_rvv_f32: 1652
---
 libavfilter/af_afirdsp.h |  3 ++
 libavfilter/riscv/Makefile   |  2 ++
 libavfilter/riscv/af_afir_init.c | 39 
 libavfilter/riscv/af_afir_rvv.S  | 61 
 4 files changed, 105 insertions(+)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..d2d1e909c1 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
 } AudioFIRDSPContext;
 
 void ff_afir_init_x86(AudioFIRDSPContext *s);
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
 {
@@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 
 #if ARCH_X86
 ff_afir_init_x86(dsp);
+#elif ARCH_RISCV
+ff_afir_init_riscv(dsp);
 #endif
 }
 
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 00..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 00..13df8341e7
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,39 @@
+/*
+ * Copyrigh

Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add

2023-11-15 Thread flow gg
Okay, I have updated these issues in the patch.

Rémi Denis-Courmont  于2023年11月13日周一 23:35写道:

>Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> > to shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> >  } AudioFIRDSPContext;
> >
> >  void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> >  static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> >  {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> >  #if ARCH_X86
> >  ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > +ff_afir_init_riscv(dsp);
>
> Ditto.
>
> >  #endif
> >  }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 00..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 00..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include 
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > +   ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > +int flags = av_get_cpu_flags();
> > +
> > +if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > +s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 00..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the Licen

Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add

2023-11-15 Thread flow gg
Okay, I have updated these issues in the patch.

Rémi Denis-Courmont  于2023年11月13日周一 23:35写道:

>Hi,
>
> Le maanantaina 13. marraskuuta 2023, 11.43.01 EET flow gg a écrit :
> > Sorry for the long delay in responding.
>
> No problem. Working with T-Head C910 (or C920?) cores is very tedious. I
> gave
> up on that and switched over to Kendryte K230 (based on C908) now.
>
> > How is the modified patch now?
>
> It looks better, but some minute improvements are still possible.
>
> > no longer using register stride(learn from your code) and have switched
> > to shNadd instead.
> >
> > (using m4 and m2 as they are slightly faster than m8 and m4)
> >
> > benchmark:
> > fcmul_add_c: 2179
> > fcmul_add_rvv_f32: 1652
>
> > diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
> > index 4208501393..d2d1e909c1 100644
> > --- a/libavfilter/af_afirdsp.h
> > +++ b/libavfilter/af_afirdsp.h
> > @@ -34,6 +34,7 @@ typedef struct AudioFIRDSPContext {
> >  } AudioFIRDSPContext;
> >
> >  void ff_afir_init_x86(AudioFIRDSPContext *s);
> > +void ff_afir_init_riscv(AudioFIRDSPContext *s);
>
> Nit: please stick to alphabetical order like most similar code.
>
> >
> >  static void fcmul_add_c(float *sum, const float *t, const float *c,
> > ptrdiff_t len)
> >  {
> > @@ -76,6 +77,8 @@ static av_unused void ff_afir_init(AudioFIRDSPContext
> > *dsp)
> >
> >  #if ARCH_X86
> >  ff_afir_init_x86(dsp);
> > +#elif ARCH_RISCV
> > +ff_afir_init_riscv(dsp);
>
> Ditto.
>
> >  #endif
> >  }
> >
> > diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
> > new file mode 100644
> > index 00..0b968a9c0d
> > --- /dev/null
> > +++ b/libavfilter/riscv/Makefile
> > @@ -0,0 +1,2 @@
> > +OBJS += riscv/af_afir_init.o
> > +RVV-OBJS += riscv/af_afir_rvv.o
> > diff --git a/libavfilter/riscv/af_afir_init.c
> > b/libavfilter/riscv/af_afir_init.c new file mode 100644
> > index 00..13df8341e7
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_init.c
> > @@ -0,0 +1,39 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include 
> > +
> > +#include "config.h"
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavfilter/af_afirdsp.h"
> > +
> > +void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
> > +   ptrdiff_t len);
> > +
> > +av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
> > +{
> > +#if HAVE_RVV
> > +int flags = av_get_cpu_flags();
> > +
> > +if (flags & AV_CPU_FLAG_RVV_F32)
>
> You need to check for Zba as well here. I doubt that we'll see hardware
> with V
> and without Zba in real life, but for the sake of correctness...
>
> > +s->fcmul_add = ff_fcmul_add_rvv;
> > +#endif
> > +}
> > diff --git a/libavfilter/riscv/af_afir_rvv.S
> > b/libavfilter/riscv/af_afir_rvv.S new file mode 100644
> > index 00..078cac8e7e
> > --- /dev/null
> > +++ b/libavfilter/riscv/af_afir_rvv.S
> > @@ -0,0 +1,61 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the Licen

Re: [FFmpeg-devel] [PATCH] af_afir: RISC-V V fcmul_add

2023-11-15 Thread flow gg
Okay, I have modified them to 64 and added some descriptions.

Rémi Denis-Courmont  于2023年11月15日周三 23:06写道:

> Le keskiviikkona 15. marraskuuta 2023, 10.59.55 EET flow gg a écrit :
> > Okay, I have updated these issues in the patch.
>
> It does not assemble but I can fix it locally. The narrowing shift
> trickery requires Zve64x, or rather Zve64f in this case.
>
> The performance improvement is much better on newer hardware:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f64: 2399.5
>
> FWIW, VLSEG2E32.V remains slightly worse than with shifting:
> fcmul_add_c: 4891.2
> fcmul_add_rvv_f32: 2877.5
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 6b88fbf9b94c098841197c9fcb467006177ee4c6 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 3 Nov 2023 10:35:53 +0800
Subject: [PATCH] af_afir: RISC-V V fcmul_add

Segmented loads are slow, so here we use unit-strided load and narrowing shifts.

c910:
fcmul_add_c: 2179
fcmul_add_rvv_f64: 1652

c908:
fcmul_add_c: 4891.2
fcmul_add_rvv_f64: 2399.5
---
 libavfilter/af_afirdsp.h |  5 ++-
 libavfilter/riscv/Makefile   |  2 ++
 libavfilter/riscv/af_afir_init.c | 42 
 libavfilter/riscv/af_afir_rvv.S  | 55 
 4 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 libavfilter/riscv/Makefile
 create mode 100644 libavfilter/riscv/af_afir_init.c
 create mode 100644 libavfilter/riscv/af_afir_rvv.S

diff --git a/libavfilter/af_afirdsp.h b/libavfilter/af_afirdsp.h
index 4208501393..827e067a9b 100644
--- a/libavfilter/af_afirdsp.h
+++ b/libavfilter/af_afirdsp.h
@@ -33,6 +33,7 @@ typedef struct AudioFIRDSPContext {
   ptrdiff_t len);
 } AudioFIRDSPContext;
 
+void ff_afir_init_riscv(AudioFIRDSPContext *s);
 void ff_afir_init_x86(AudioFIRDSPContext *s);
 
 static void fcmul_add_c(float *sum, const float *t, const float *c, ptrdiff_t len)
@@ -74,7 +75,9 @@ static av_unused void ff_afir_init(AudioFIRDSPContext *dsp)
 dsp->fcmul_add = fcmul_add_c;
 dsp->dcmul_add = dcmul_add_c;
 
-#if ARCH_X86
+#if ARCH_RISCV
+ff_afir_init_riscv(dsp);
+#elif ARCH_X86
 ff_afir_init_x86(dsp);
 #endif
 }
diff --git a/libavfilter/riscv/Makefile b/libavfilter/riscv/Makefile
new file mode 100644
index 00..0b968a9c0d
--- /dev/null
+++ b/libavfilter/riscv/Makefile
@@ -0,0 +1,2 @@
+OBJS += riscv/af_afir_init.o
+RVV-OBJS += riscv/af_afir_rvv.o
diff --git a/libavfilter/riscv/af_afir_init.c b/libavfilter/riscv/af_afir_init.c
new file mode 100644
index 00..52aa18c126
--- /dev/null
+++ b/libavfilter/riscv/af_afir_init.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "config.h"
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavfilter/af_afirdsp.h"
+
+void ff_fcmul_add_rvv(float *sum, const float *t, const float *c,
+   ptrdiff_t len);
+
+av_cold void ff_afir_init_riscv(AudioFIRDSPContext *s)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_F64) {
+if (flags & AV_CPU_FLAG_RVB_ADDR) {
+s->fcmul_add = ff_fcmul_add_rvv;
+}
+}
+#endif
+}
diff --git a/libavfilter/riscv/af_afir_rvv.S b/libavfilter/riscv/af_afir_rvv.S
new file mode 100644
index 00..04ec2e50d8
--- /dev/null
+++ b/libavfilter/riscv/af_afir_rvv.S
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ 

[FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-17 Thread flow gg

From 2785ce57f68dbb2373c951b9432afa73796f7cc1 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 18 Nov 2023 10:58:17 +0800
Subject: [PATCH] checkasm: test for dcmul_add

---
 tests/checkasm/af_afir.c | 141 +++
 1 file changed, 98 insertions(+), 43 deletions(-)

diff --git a/tests/checkasm/af_afir.c b/tests/checkasm/af_afir.c
index 08c55dacfc..6cb59dbb6c 100644
--- a/tests/checkasm/af_afir.c
+++ b/tests/checkasm/af_afir.c
@@ -33,64 +33,119 @@ do {  \
 int i;\
 double bmg[2], stddev = 10.0, mean = 0.0; \
   \
-for (i = 0; i < LEN*2+8; i += 2) {\
+for (i = 0; i < BUF_SIZE; i += 2) {\
 av_bmg_get(&checkasm_lfg, bmg);   \
 buf[i] = bmg[0] * stddev + mean;  \
 buf[i + 1] = bmg[1] * stddev + mean;  \
 } \
 } while(0);
 
-static void test_fcmul_add(const float *src0, const float *src1, const float *src2)
+static void test_fcmul_add(AudioFIRDSPContext *fir)
 {
-LOCAL_ALIGNED_32(float, cdst, [LEN*2+8]);
-LOCAL_ALIGNED_32(float, odst, [LEN*2+8]);
-int i;
-
-declare_func(void, float *sum, const float *t, const float *c,
- ptrdiff_t len);
-
-memcpy(cdst, src0, (LEN*2+8) * sizeof(float));
-memcpy(odst, src0, (LEN*2+8) * sizeof(float));
-call_ref(cdst, src1, src2, LEN);
-call_new(odst, src1, src2, LEN);
-for (i = 0; i <= LEN*2; i++) {
-int idx = i & ~1;
-float cre = src2[idx];
-float cim = src2[idx + 1];
-float tre = src1[idx];
-float tim = src1[idx + 1];
-double t = fabs(src0[i]) +
-   fabs(tre) + fabs(tim) + fabs(cre) + fabs(cim) +
-   fabs(tre * cre) + fabs(tim * cim) +
-   fabs(tre * cim) + fabs(tim * cre) +
-   fabs(tre * cre - tim * cim) +
-   fabs(tre * cim + tim * cre) +
-   fabs(cdst[i]) + 1.0;
-if (!float_near_abs_eps(cdst[i], odst[i], t * 2 * FLT_EPSILON)) {
-fprintf(stderr, "%d: %- .12f - %- .12f = % .12g\n",
-i, cdst[i], odst[i], cdst[i] - odst[i]);
-fail();
-break;
+#define BUF_SIZE LEN*2+8
+LOCAL_ALIGNED_32(float, src0, [BUF_SIZE]);
+LOCAL_ALIGNED_32(float, src1, [BUF_SIZE]);
+LOCAL_ALIGNED_32(float, src2, [BUF_SIZE]);
+
+randomize_buffer(src0);
+randomize_buffer(src1);
+randomize_buffer(src2);
+
+if (check_func(fir->fcmul_add, "fcmul_add")) {
+LOCAL_ALIGNED_32(float, cdst, [BUF_SIZE]);
+LOCAL_ALIGNED_32(float, odst, [BUF_SIZE]);
+int i;
+
+declare_func(void, float *sum, const float *t, const float *c,
+ ptrdiff_t len);
+
+memcpy(cdst, src0, (BUF_SIZE) * sizeof(float));
+memcpy(odst, src0, (BUF_SIZE) * sizeof(float));
+call_ref(cdst, src1, src2, LEN);
+call_new(odst, src1, src2, LEN);
+for (i = 0; i <= LEN*2; i++) {
+int idx = i & ~1;
+float cre = src2[idx];
+float cim = src2[idx + 1];
+float tre = src1[idx];
+float tim = src1[idx + 1];
+double t = fabs(src0[i]) +
+   fabs(tre) + fabs(tim) + fabs(cre) + fabs(cim) +
+   fabs(tre * cre) + fabs(tim * cim) +
+   fabs(tre * cim) + fabs(tim * cre) +
+   fabs(tre * cre - tim * cim) +
+   fabs(tre * cim + tim * cre) +
+   fabs(cdst[i]) + 1.0;
+if (!float_near_abs_eps(cdst[i], odst[i], t * 2 * FLT_EPSILON)) {
+fprintf(stderr, "%d: %- .12f - %- .12f = % .12g\n",
+i, cdst[i], odst[i], cdst[i] - odst[i]);
+fail();
+break;
+}
 }
+memcpy(odst, src0, (BUF_SIZE) * sizeof(float));
+bench_new(odst, src1, src2, LEN);
 }
-memcpy(odst, src0, (LEN*2+8) * sizeof(float));
-bench_new(odst, src1, src2, LEN);
+
+report("fcmul_add");
 }
 
-void checkasm_check_afir(void)
+static void test_dcmul_add(AudioFIRDSPContext *fir)
 {
-LOCAL_ALIGNED_32(float, src0, [LEN*2+8]);
-LOCAL_ALIGNED_32(float, src1, [LEN*2+8]);
-LOCAL_ALIGNED_32(float, src2, [LEN*2+8]);
-AudioFIRDSPContext fir = { 0 };
-
-ff_afir_init(&fir);
+#define BUF_SIZE LEN*2+8
+LOCAL_ALIGNED_32(double, src0, [BUF_SIZE]);
+LOCAL_ALIGNED_32(double, src1, [BUF_SIZE]);
+LOCAL_ALIGNED_32(double, src2, [BUF_SIZE]);
 
 randomize_buffer(src0);
 randomize_buffer(src1);
 randomize_buffer(src2);
 
-if (check_func(fir.fcmul_add, "fcmul_add"))
-test_fcmul_add(src0, src1, src2);
-report("fcmul_add");
+if (check_func(fir->dcmul_add, "dcmul_add")) {
+LOCAL_ALIGNED_32(double, cdst, [BUF_SIZE]);
+LOCAL_ALIGNED_

Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-18 Thread flow gg
"dcmul_add")) {
+LOCAL_ALIGNED_32(double, cdst, [BUF_SIZE]);
+LOCAL_ALIGNED_32(double, odst, [BUF_SIZE]);
+int i;
+
+declare_func(void, double *sum, const double *t, const double *c,
+ ptrdiff_t len);
+
+memcpy(cdst, src0, (BUF_SIZE) * sizeof(double));
+memcpy(odst, src0, (BUF_SIZE) * sizeof(double));
+call_ref(cdst, src1, src2, LEN);
+call_new(odst, src1, src2, LEN);
+for (i = 0; i <= LEN*2; i++) {
+int idx = i & ~1;
+double cre = src2[idx];
+double cim = src2[idx + 1];
+double tre = src1[idx];
+double tim = src1[idx + 1];
+double t = fabs(src0[i]) +
+   fabs(tre) + fabs(tim) + fabs(cre) + fabs(cim) +
+   fabs(tre * cre) + fabs(tim * cim) +
+   fabs(tre * cim) + fabs(tim * cre) +
+   fabs(tre * cre - tim * cim) +
+   fabs(tre * cim + tim * cre) +
+   fabs(cdst[i]) + 1.0;
+if (!double_near_abs_eps(cdst[i], odst[i], t * 2 * FLT_EPSILON)) {
+fprintf(stderr, "%d: %- .12f - %- .12f = % .12g\n",
+i, cdst[i], odst[i], cdst[i] - odst[i]);
+fail();
+break;
+}
+}
+memcpy(odst, src0, (BUF_SIZE) * sizeof(double));
+bench_new(odst, src1, src2, LEN);
+}
+
+report("dcmul_add");
+}
+
+
+void checkasm_check_afir(void)
+{
+    AudioFIRDSPContext fir = { 0 };
+
+ff_afir_init(&fir);
+test_fcmul_add(&fir);
+test_dcmul_add(&fir);
 }
-- 
2.42.1


flow gg  于2023年11月18日周六 11:21写道:

>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
c910
float_to_fixed24_c: 208.2
float_to_fixed24_rvv_f32: 71.5
From 69da974fd0febaa74db4dd551b05172caeefb846 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 208.2
float_to_fixed24_rvv_f32: 71.5
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  5 +
 libavcodec/riscv/ac3dsp_rvv.S  | 40 ++
 3 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..de82d1c7a7 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -35,4 +36,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
 }
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+}
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..4d8ab060e7
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+li    t1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m4, ta, ma
+vle32.v   v0, (a1)
+slli  t3, t0, 2
+vfmul.vf  v0, v0, f0
+vfcvt.x.f.v   v0, v0
+add   a1, a1, t3
+vse32.v   v0, (a0)
+add   a0, a0, t3
+sub   a2, a2, t0
+bgtz  a2, 1b
+
+ret
+endfunc
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
> How did you test it?

I wrote a test, but it was a bit rough, so I want to modify it before
submitting. I've added it to this reply.

> This does not seem according to the C ABI. AFAIK `unsigned` is sign-extended.

I'm a bit confused... because this passed in the tests I wrote in qemu.
Maybe there's a problem with my test?

> ALU right before dependent conditional branch should be avoided.

Should the sub be moved forward? I've modified it.

> SHxADD can be used advantageously.

Okay, I've made the modification

Rémi Denis-Courmont  于2023年11月22日周三 21:41写道:

> Hi,
>
> How did you test it? As per
> http://ffmpeg.org/pipermail/ffmpeg-devel/2023-June/310720.html we still
> don't have a FATE instance set up with the RISC-V Vector extension. The
> only testing consists of my manual runs of checkasm on a K230 board. (We
> *do* have Zba and Zbb now though, hence the existing extract_exponents()).
>
> Also:
> - This does not seem according to the C ABI. AFAIK `unsigned` is
> sign-extended.
> - ALU right before dependent conditional branch should be avoided.
> - SHxADD can be used advantageously.
>
>
> Le 22 novembre 2023 14:00:07 GMT+02:00, flow gg  a
> écrit :
> >c910
> >float_to_fixed24_c: 208.2
> >float_to_fixed24_rvv_f32: 71.5
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 3e790fdccd780257f464aa8f8a56a37321ddd429 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 208.2
float_to_fixed24_rvv_f32: 71.5
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  5 +
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..de82d1c7a7 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -35,4 +36,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
 }
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+}
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..c0e2880e28
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+lit1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m4, ta, ma
+sub   a2, a2, t0
+vle32.v   v0, (a1)
+vfmu

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
qemu-riscv64 -cpu rv64,v=true,g=true,c=true,zba=true,vlen=128 checkasm
--test=ac3dsp

flow gg  于2023年11月22日周三 22:30写道:

> > How did you test it?
>
> I wrote a test, but it was a bit rough, so I want to modify it before
> submitting. I've added it to this reply.
>
> > This does not seem according to the C ABI. AFAIK `unsigned` is
> sign-extended.
>
> I'm a bit confused... because this passed in the tests I wrote in qemu.
> Maybe there's a problem with my test?
>
> > ALU right before dependent conditional branch should be avoided.
>
> Should the sub be moved forward? I've modified it.
>
> > SHxADD can be used advantageously.
>
> Okay, I've made the modification
>
> Rémi Denis-Courmont  于2023年11月22日周三 21:41写道:
>
>> Hi,
>>
>> How did you test it? As per
>> http://ffmpeg.org/pipermail/ffmpeg-devel/2023-June/310720.html we still
>> don't have a FATE instance set up with the RISC-V Vector extension. The
>> only testing consists of my manual runs of checkasm on a K230 board. (We
>> *do* have Zba and Zbb now though, hence the existing extract_exponents()).
>>
>> Also:
>> - This does not seem according to the C ABI. AFAIK `unsigned` is
>> sign-extended.
>> - ALU right before dependent conditional branch should be avoided.
>> - SHxADD can be used advantageously.
>>
>>
>> Le 22 novembre 2023 14:00:07 GMT+02:00, flow gg  a
>> écrit :
>> >c910
>> >float_to_fixed24_c: 208.2
>> >float_to_fixed24_rvv_f32: 71.5
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
Thank you for your guidance, I finally understand..  How about choosing
manual zero-extension for rv64? I modified the patch.

#if (__riscv_xlen == 64)
slli a2, a2, 32
srli a2, a2, 32
#endif

Rémi Denis-Courmont  于2023年11月22日周三 22:51写道:

>
>
> Le 22 novembre 2023 16:30:44 GMT+02:00, flow gg  a
> écrit :
> >> How did you test it?
> >
> >I wrote a test, but it was a bit rough, so I want to modify it before
> >submitting. I've added it to this reply.
> >
> >> This does not seem according to the C ABI. AFAIK `unsigned` is
> >sign-extended.
> >
> >I'm a bit confused... because this passed in the tests I wrote in qemu.
> >Maybe there's a problem with my test?
>
> You probably didn't test sizes between 2^31 and 2^32-1. This might not
> even be feasible in QEMU.
>
> Ideally the prototype would use size_t, then the problem wouldn't exist.
>
> >
> >> ALU right before dependent conditional branch should be avoided.
> >
> >Should the sub be moved forward? I've modified it.
> >
> >> SHxADD can be used advantageously.
> >
> >Okay, I've made the modification
> >
> >Rémi Denis-Courmont  于2023年11月22日周三 21:41写道:
> >
> >> Hi,
> >>
> >> How did you test it? As per
> >> http://ffmpeg.org/pipermail/ffmpeg-devel/2023-June/310720.html we still
> >> don't have a FATE instance set up with the RISC-V Vector extension. The
> >> only testing consists of my manual runs of checkasm on a K230 board. (We
> >> *do* have Zba and Zbb now though, hence the existing
> extract_exponents()).
> >>
> >> Also:
> >> - This does not seem according to the C ABI. AFAIK `unsigned` is
> >> sign-extended.
> >> - ALU right before dependent conditional branch should be avoided.
> >> - SHxADD can be used advantageously.
> >>
> >>
> >> Le 22 novembre 2023 14:00:07 GMT+02:00, flow gg 
> a
> >> écrit :
> >> >c910
> >> >float_to_fixed24_c: 208.2
> >> >float_to_fixed24_rvv_f32: 71.5
> >> ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From d709519219138b746ff622b15bb004b27eed7333 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 208.2
float_to_fixed24_rvv_f32: 71.5
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  5 
 libavcodec/riscv/ac3dsp_rvv.S  | 45 ++
 3 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..de82d1c7a7 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -35,4 +36,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
 }
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+}
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
Wow, thank you for reviewing this. I just wanted to see if the function was
working properly. There are so many bugs in the test code ...
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
Hello, I saw the new commit "avcodec/ac3dsp: make len a size_t in
float_to_fixed24."

So I removed the part #if (__riscv_xlen == 64) and restored the patch.
From 3e790fdccd780257f464aa8f8a56a37321ddd429 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 208.2
float_to_fixed24_rvv_f32: 71.5
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  5 +
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..de82d1c7a7 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -35,4 +36,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
 }
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+}
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..c0e2880e28
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+li    t1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m4, ta, ma
+sub   a2, a2, t0
+vle32.v   v0, (a1)
+vfmul.vf  v0, v0, f0
+vfcvt.x.f.v   v0, v0
+sh2add    a1, t0, a1
+vse32.v   v0, (a0)
+sh2add    a0, t0, a0
+bgtz  a2, 1b
+
+ret
+endfunc
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH] checkasm/ac3dsp: add float_to_fixed24 test

2023-11-22 Thread flow gg

From 02dd534bd602ba3ec79e51070934949a98f780e2 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] checkasm/ac3dsp: add float_to_fixed24 test

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/ac3dsp.c   | 71 +++
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 4 files changed, 76 insertions(+)
 create mode 100644 tests/checkasm/ac3dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8bc241d29b..53742c93ae 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -1,5 +1,6 @@
 # libavcodec tests
 # subsystems
+AVCODECOBJS-$(CONFIG_AC3DSP)    += ac3dsp.o
 AVCODECOBJS-$(CONFIG_AUDIODSP)  += audiodsp.o
 AVCODECOBJS-$(CONFIG_BLOCKDSP)  += blockdsp.o
 AVCODECOBJS-$(CONFIG_BSWAPDSP)  += bswapdsp.o
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
new file mode 100644
index 00..2ccfa4a9d1
--- /dev/null
+++ b/tests/checkasm/ac3dsp.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/ac3dsp.h"
+
+#include "checkasm.h"
+
+#define randomize_float(buf, len)   \
+do {\
+int i;  \
+for (i = 0; i < len; i++) { \
+float f = (float)rnd() / (UINT_MAX >> 5) - 16.0f;   \
+buf[i] = f; \
+}   \
+} while (0)
+
+static void check_float_to_fixed24(AC3DSPContext *c) {
+#define BUF_SIZE 1024
+LOCAL_ALIGNED_32(int32_t, v1, [BUF_SIZE]);
+LOCAL_ALIGNED_32(float, v2, [BUF_SIZE]);
+
+declare_func(void, int32_t *, const float *, unsigned int);
+
+randomize_float(v2, BUF_SIZE);
+
+if (check_func(c->float_to_fixed24, "float_to_fixed24")) {
+LOCAL_ALIGNED_32(int32_t, dst, [BUF_SIZE]);
+LOCAL_ALIGNED_32(int32_t, dst2, [BUF_SIZE]);
+
+call_ref(dst, v2, BUF_SIZE);
+call_new(dst2, v2, BUF_SIZE);
+
+if (memcmp(dst, dst2, sizeof(*dst) * 10) != 0)
+fail();
+
+bench_new(v1, v2, BUF_SIZE);
+}
+
+
+report("float_to_fixed24");
+}
+
+void checkasm_check_ac3dsp(void)
+{
+AC3DSPContext c;
+ff_ac3dsp_init(&c);
+
+check_float_to_fixed24(&c);
+}
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index 708119e7c6..f37c7fad3a 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -78,6 +78,9 @@ static const struct {
 { "aacpsdsp", checkasm_check_aacpsdsp },
 { "sbrdsp",   checkasm_check_sbrdsp },
 #endif
+#if CONFIG_AC3DSP
+{ "ac3dsp", checkasm_check_ac3dsp },
+#endif
 #if CONFIG_ALAC_DECODER
 { "alacdsp", checkasm_check_alacdsp },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index cfea868ff1..a4238b1dfa 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -43,6 +43,7 @@
 #include "libavutil/timer.h"
 
 void checkasm_check_aacpsdsp(void);
+void checkasm_check_ac3dsp(void);
 void checkasm_check_afir(void);
 void checkasm_check_alacdsp(void);
 void checkasm_check_audiodsp(void);
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-22 Thread flow gg
I modified the temporary test and sent it in "[FFmpeg-devel] [PATCH]
checkasm/ac3dsp: add float_to_fixed24 test".

So the test time results have changed, and I updated them in the patch.

c910
  float_to_fixed24_c: 2207.2
  float_to_fixed24_rvv_f32: 696.2

flow gg  于2023年11月22日周三 20:00写道:

> c910
> float_to_fixed24_c: 208.2
> float_to_fixed24_rvv_f32: 71.5
>
From 3e790fdccd780257f464aa8f8a56a37321ddd429 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
  float_to_fixed24_c: 2207.2
  float_to_fixed24_rvv_f32: 696.2
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  5 +
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..de82d1c7a7 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -35,4 +36,8 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
 }
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
+}
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..c0e2880e28
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+li        t1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m4, ta, ma
+sub   a2, a2, t0
+vle32.v   v0, (a1)
+vfmul.vf  v0, v0, f0
+vfcvt.x.f.v   v0, v0
+sh2add    a1, t0, a1
+vse32.v   v0, (a0)
+sh2add    a0, t0, a0
+bgtz  a2, 1b
+
+ret
+endfunc
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-11-23 Thread flow gg
Okay, changed

Rémi Denis-Courmont  于2023年11月24日周五 01:09写道:

> Le torstaina 23. marraskuuta 2023, 1.17.03 EET flow gg a écrit :
> > Hello, I saw the new commit "avcodec/ac3dsp: make len a size_t in
> > float_to_fixed24."
> >
> > So I removed the part #if (__riscv_xlen == 64) and restored the patch.
>
> You're not checking for Zba. Also 'bnez'  would be more logical than
> 'bgtz'
> for an unsigned counter.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From af221d659ebc1e97b6d274681061fa8331d0b147 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 2207.2
float_to_fixed24_rvv_f32: 696.2
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  3 +++
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 8f2a519827..ac7b7c2929 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..25244943cb 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -26,6 +26,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, unsigned int len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -34,5 +35,7 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_ADDR) {
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
+if (flags & AV_CPU_FLAG_RVV_F32)
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
 }
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..82c14ea275
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+li        t1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m4, ta, ma
+sub   a2, a2, t0
+vle32.v   v0, (a1)
+vfmul.vf  v0, v0, f0
+vfcvt.x.f.v   v0, v0
+sh2add    a1, t0, a1
+vse32.v   v0, (a0)
+sh2add    a0, t0, a0
+bnez  a2, 1b
+
+ret
+endfunc
-- 
2.43.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] checkasm/ac3dsp: add float_to_fixed24 test

2023-11-23 Thread flow gg
> You should probably add the test case to tests/fate/checkasm.mak

> This one is not necessary. You can reuse dst or dst2 for the bench() as
it's write only.

> Changed BUF_SIZE instead of 10.

Okay, changed.

James Almer  于2023年11月24日周五 01:11写道:

> On 11/23/2023 4:08 AM, flow gg wrote:
> > +static void check_float_to_fixed24(AC3DSPContext *c) {
> > +#define BUF_SIZE 1024
> > +LOCAL_ALIGNED_32(int32_t, v1, [BUF_SIZE]);
>
> This one is not necessary. You can reuse dst or dst2 for the bench() as
> it's write only.
>
> > +LOCAL_ALIGNED_32(float, v2, [BUF_SIZE]);
> > +
> > +declare_func(void, int32_t *, const float *, unsigned int);
> > +
> > +randomize_float(v2, BUF_SIZE);
> > +
> > +if (check_func(c->float_to_fixed24, "float_to_fixed24")) {
> > +LOCAL_ALIGNED_32(int32_t, dst, [BUF_SIZE]);
> > +LOCAL_ALIGNED_32(int32_t, dst2, [BUF_SIZE]);
> > +
> > +call_ref(dst, v2, BUF_SIZE);
> > +call_new(dst2, v2, BUF_SIZE);
> > +
> > +if (memcmp(dst, dst2, sizeof(*dst) * 10) != 0)
>
> BUF_SIZE instead of 10.
>
> > +fail();
> > +
> > +bench_new(v1, v2, BUF_SIZE);
> > +}
> > +
> > +
> > +report("float_to_fixed24");
> > +}
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From d5bfbecdd32dda0839387d470fee72b6155f084d Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] checkasm/ac3dsp: add float_to_fixed24 test

---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/ac3dsp.c   | 70 +++
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 76 insertions(+)
 create mode 100644 tests/checkasm/ac3dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8bc241d29b..53742c93ae 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -1,5 +1,6 @@
 # libavcodec tests
 # subsystems
+AVCODECOBJS-$(CONFIG_AC3DSP)+= ac3dsp.o
 AVCODECOBJS-$(CONFIG_AUDIODSP)  += audiodsp.o
 AVCODECOBJS-$(CONFIG_BLOCKDSP)  += blockdsp.o
 AVCODECOBJS-$(CONFIG_BSWAPDSP)  += bswapdsp.o
diff --git a/tests/checkasm/ac3dsp.c b/tests/checkasm/ac3dsp.c
new file mode 100644
index 00..8f36f1736c
--- /dev/null
+++ b/tests/checkasm/ac3dsp.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/ac3dsp.h"
+
+#include "checkasm.h"
+
+#define randomize_float(buf, len)   \
+do {\
+int i;  \
+for (i = 0; i < len; i++) { \
+float f = (float)rnd() / (UINT_MAX >> 5) - 16.0f;   \
+buf[i] = f; \
+}   \
+} while (0)
+
+static void check_float_to_fixed24(AC3DSPContext *c) {
+#define BUF_SIZE 1024
+LOCAL_ALIGNED_32(float, src, [BUF_SIZE]);
+
+declare_func(void, int32_t *, const float *, unsigned int);
+
+randomize_float(src, BUF_SIZE);
+
+if (check_func(c->float_to_fixed24, "float_to_fixed24")) {
+LOCAL_ALIGNED_32(int32_t, dst, [BUF_SIZE]);
+LOCAL_ALIGNED_32(int32_t, dst2, [BUF_SIZE]);
+
+call_ref(dst, src, BUF_SIZE);
+call_new(dst2, src, BUF_SIZE);
+
+if (memcmp(dst, dst2, BUF_SIZE) != 0)
+fail();
+
+bench_new(dst, src, BUF_SIZE);
+}
+
+
+report("float_to_fixed24");

Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-26 Thread flow gg
This is a bit confusing for me.. I tried pulling the latest code, and then
used `git am checkasm-test-for-dcmul_add.patch` without any patch
corruption.

Rémi Denis-Courmont  于2023年11月27日周一 03:36写道:

> Le sunnuntaina 19. marraskuuta 2023, 0.28.10 EET flow gg a écrit :
> > From 2785ce57f68dbb2373c951b9432afa73796f7cc1 Mon Sep 17 00:00:00 2001
> > From: sunyuechi 
> > Date: Sat, 18 Nov 2023 10:58:17 +0800
> > Subject: [PATCH] checkasm: test for dcmul_add
>
> git-am reports the patch corrupt.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] checkasm: add test for dcmul_add

2023-11-27 Thread flow gg
In this email, I first attached the original patch, and then sent the
content of this patch in the second reply.
So I tried downloading the attached original patch and running git am
without any issues.
Then I tried copying the content from the second reply into a patch file,
and running git am on that also posed no problems.
(I am using the Gmail web page.)

Rémi Denis-Courmont  于2023年11月27日周一 20:17写道:

>
>
> Le 26 novembre 2023 22:54:28 GMT+02:00, flow gg  a
> écrit :
> >This is a bit confusing for me.. I tried pulling the latest code, and then
> >used `git am checkasm-test-for-dcmul_add.patch` without any patch
> >corruption.
>
> Did you try with the actual sent email or only with the original patch
> file? ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >>
> >___
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-11-28 Thread flow gg

From 85e60d75554894964825f5718d14591294ec4e88 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 28 Nov 2023 14:08:12 +0800
Subject: [PATCH 1/2] checkasm: test for abs_pow34

---
 libavcodec/aacenc.c| 24 +++--
 libavcodec/aacenc.h|  1 +
 tests/checkasm/Makefile|  1 +
 tests/checkasm/aacencdsp.c | 70 ++
 tests/checkasm/checkasm.c  |  3 ++
 tests/checkasm/checkasm.h  |  1 +
 tests/fate/checkasm.mak|  3 +-
 7 files changed, 92 insertions(+), 11 deletions(-)
 create mode 100644 tests/checkasm/aacencdsp.c

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 5e6a255a8f..443b25e25a 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -1381,16 +1381,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
 ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
 s->random_state = 0x1f2e3d4c;
 
-s->abs_pow34   = abs_pow34_v;
-s->quant_bands = quantize_bands;
-
-#if ARCH_X86
-ff_aac_dsp_init_x86(s);
-#endif
-
-#if HAVE_MIPSDSP
-ff_aac_coder_init_mips(s);
-#endif
+		ff_aac_dsp_init(s);
 
 ff_af_queue_init(avctx, &s->afq);
 
@@ -1444,3 +1435,16 @@ const FFCodec ff_aac_encoder = {
  AV_SAMPLE_FMT_NONE },
 .p.priv_class   = &aacenc_class,
 };
+
+void ff_aac_dsp_init(AACEncContext *s){
+s->abs_pow34   = abs_pow34_v;
+s->quant_bands = quantize_bands;
+
+#if ARCH_X86
+ff_aac_dsp_init_x86(s);
+#endif
+
+#if HAVE_MIPSDSP
+ff_aac_coder_init_mips(s);
+#endif
+}
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index b030c652ae..09dd8639be 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -154,6 +154,7 @@ typedef struct AACEncContext {
 } buffer;
 } AACEncContext;
 
+void ff_aac_dsp_init(AACEncContext *s);
 void ff_aac_dsp_init_x86(AACEncContext *s);
 void ff_aac_coder_init_mips(AACEncContext *c);
 void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8bc241d29b..da209dd7ad 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -22,6 +22,7 @@ AVCODECOBJS-$(CONFIG_VIDEODSP)  += videodsp.o
 # decoders/encoders
 AVCODECOBJS-$(CONFIG_AAC_DECODER)   += aacpsdsp.o \
sbrdsp.o
+AVCODECOBJS-$(CONFIG_AAC_ENCODER)   += aacencdsp.o
 AVCODECOBJS-$(CONFIG_ALAC_DECODER)  += alacdsp.o
 AVCODECOBJS-$(CONFIG_DCA_DECODER)   += synth_filter.o
 AVCODECOBJS-$(CONFIG_EXR_DECODER)   += exrdsp.o
diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
new file mode 100644
index 00..684c775862
--- /dev/null
+++ b/tests/checkasm/aacencdsp.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/aacenc.h"
+
+#include "checkasm.h"
+
+#define randomize_float(buf, len)   \
+do {\
+int i;  \
+for (i = 0; i < len; i++) { \
+float f = (float)rnd() / (UINT_MAX >> 5) - 16.0f;   \
+buf[i] = f; \
+}   \
+} while (0)
+
+static void test_abs_pow34(AACEncContext *s) {
+#define BUF_SIZE 1024
+LOCAL_ALIGNED_32(float, in, [BUF_SIZE]);
+
+declare_func(void, float *, const float *, int);
+
+randomize_float(in, BUF_SIZE);
+
+if (check_func(s->abs_pow34, "abs_pow34")) {
+LOCAL_ALIGNED_32(float, out, [BUF_SIZE]);
+LOCAL_ALIGNED_32(float, out2, [BUF_SIZE]);
+
+call_ref(out, in, BUF_SIZE);
+call_new(out2, in, BUF_SIZE);
+
+if (memcmp(out, out2, BUF_SIZE * sizeof(float)) != 0)
+fail();
+
+bench_new(out, in, BUF_SIZE);
+}
+
+report("abs_pow34");
+}
+
+
+void checkasm_check_aacencdsp(void)
+{
+AACEncContext s = { 0 };
+ff_aac_dsp_init(&s);
+
+test_abs_pow34(&s);
+}
diff --git

[FFmpeg-devel] [PATCH 2/2] lavc/aacencdsp: R-V V abs_pow34

2023-11-28 Thread flow gg
c910:
abs_pow34_c: 24610.7
abs_pow34_rvv_f32: 6177.7

(This needs "[FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34" to be applied first.)
From 86577c2d40d29422c4b769c854df99a88c7b3c77 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 28 Nov 2023 20:14:14 +0800
Subject: [PATCH 2/2] lavc/aacencdsp: R-V V abs_pow34

c910:
abs_pow34_c: 24610.7
abs_pow34_rvv_f32: 6177.7
---
 libavcodec/aacenc.c   |  4 +++
 libavcodec/aacenc.h   |  1 +
 libavcodec/riscv/Makefile |  1 +
 libavcodec/riscv/aacencdsp_init.c | 42 +++
 libavcodec/riscv/aacencdsp_rvv.S  | 38 
 5 files changed, 86 insertions(+)
 create mode 100644 libavcodec/riscv/aacencdsp_init.c
 create mode 100644 libavcodec/riscv/aacencdsp_rvv.S

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 443b25e25a..55c4bf55ce 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -1440,6 +1440,10 @@ void ff_aac_dsp_init(AACEncContext *s){
 s->abs_pow34   = abs_pow34_v;
 s->quant_bands = quantize_bands;
 
+#if ARCH_RISCV
+ff_aac_dsp_init_riscv(s);
+#endif
+
 #if ARCH_X86
 ff_aac_dsp_init_x86(s);
 #endif
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index 09dd8639be..18b424736d 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -155,6 +155,7 @@ typedef struct AACEncContext {
 } AACEncContext;
 
 void ff_aac_dsp_init(AACEncContext *s);
+void ff_aac_dsp_init_riscv(AACEncContext *s);
 void ff_aac_dsp_init_x86(AACEncContext *s);
 void ff_aac_coder_init_mips(AACEncContext *c);
 void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..6028f23b58 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,4 +1,5 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
+OBJS-$(CONFIG_AAC_ENCODER) += riscv/aacencdsp_init.o riscv/aacencdsp_rvv.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
  riscv/ac3dsp_rvb.o
diff --git a/libavcodec/riscv/aacencdsp_init.c b/libavcodec/riscv/aacencdsp_init.c
new file mode 100644
index 00..83ae16f46b
--- /dev/null
+++ b/libavcodec/riscv/aacencdsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_rvv(float *out, const float *in, const int size);
+
+av_cold void ff_aac_dsp_init_riscv(AACEncContext *s)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+if (flags & AV_CPU_FLAG_RVB_ADDR) {
+s->abs_pow34 = ff_abs_pow34_rvv;
+}
+}
+#endif
+}
diff --git a/libavcodec/riscv/aacencdsp_rvv.S b/libavcodec/riscv/aacencdsp_rvv.S
new file mode 100644
index 00..07f9e7228d
--- /dev/null
+++ b/libavcodec/riscv/aacencdsp_rvv.S
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+func ff_abs_pow34_rvv, zve32f
+1:
+vsetvli  t0, a2, e32, m4, ta, ma
+sub  a2, a2, t0
+vle32.v  v0, (a1)
+ 

Re: [FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-11-30 Thread flow gg
Okay, I split it up and attached the patches.



Rémi Denis-Courmont  于2023年11月30日周四 23:31写道:

> Le tiistaina 28. marraskuuta 2023, 18.59.38 EET flow gg a écrit :
> >
>
> Since nobody else commented, I shall note that you should probably split
> the
> underlying lavc changes into a separate preliminary patch.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 95716fe7798ef207bb7924dff81970a45c358173 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 04:21:53 +0800
Subject: [PATCH 1/3] lavc/aacenc: add ff_aac_dsp_init

This is for clarity and use in testing, consistent with other parts of the code.
---
 libavcodec/aacenc.c | 24 ++--
 libavcodec/aacenc.h |  1 +
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 5e6a255a8f..443b25e25a 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -1381,16 +1381,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
 ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
 s->random_state = 0x1f2e3d4c;
 
-s->abs_pow34   = abs_pow34_v;
-s->quant_bands = quantize_bands;
-
-#if ARCH_X86
-ff_aac_dsp_init_x86(s);
-#endif
-
-#if HAVE_MIPSDSP
-ff_aac_coder_init_mips(s);
-#endif
+		ff_aac_dsp_init(s);
 
 ff_af_queue_init(avctx, &s->afq);
 
@@ -1444,3 +1435,16 @@ const FFCodec ff_aac_encoder = {
  AV_SAMPLE_FMT_NONE },
 .p.priv_class   = &aacenc_class,
 };
+
+void ff_aac_dsp_init(AACEncContext *s){
+s->abs_pow34   = abs_pow34_v;
+s->quant_bands = quantize_bands;
+
+#if ARCH_X86
+ff_aac_dsp_init_x86(s);
+#endif
+
+#if HAVE_MIPSDSP
+ff_aac_coder_init_mips(s);
+#endif
+}
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index b030c652ae..09dd8639be 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -154,6 +154,7 @@ typedef struct AACEncContext {
 } buffer;
 } AACEncContext;
 
+void ff_aac_dsp_init(AACEncContext *s);
 void ff_aac_dsp_init_x86(AACEncContext *s);
 void ff_aac_coder_init_mips(AACEncContext *c);
 void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
-- 
2.43.0

From 3c5b83a744f86107cdb58ad6e288c027cfa8c3cd Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 28 Nov 2023 14:08:12 +0800
Subject: [PATCH 2/3] checkasm: test for abs_pow34

---
 tests/checkasm/Makefile|  1 +
 tests/checkasm/aacencdsp.c | 70 ++
 tests/checkasm/checkasm.c  |  3 ++
 tests/checkasm/checkasm.h  |  1 +
 tests/fate/checkasm.mak|  3 +-
 5 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/aacencdsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 8bc241d29b..da209dd7ad 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -22,6 +22,7 @@ AVCODECOBJS-$(CONFIG_VIDEODSP)  += videodsp.o
 # decoders/encoders
 AVCODECOBJS-$(CONFIG_AAC_DECODER)   += aacpsdsp.o \
sbrdsp.o
+AVCODECOBJS-$(CONFIG_AAC_ENCODER)   += aacencdsp.o
 AVCODECOBJS-$(CONFIG_ALAC_DECODER)  += alacdsp.o
 AVCODECOBJS-$(CONFIG_DCA_DECODER)   += synth_filter.o
 AVCODECOBJS-$(CONFIG_EXR_DECODER)   += exrdsp.o
diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
new file mode 100644
index 00..684c775862
--- /dev/null
+++ b/tests/checkasm/aacencdsp.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/aacenc.h"
+
+#include "checkasm.h"
+
+#define randomize_float(buf, len)   \
+do {\
+in

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-12-01 Thread flow gg
Okay, changed and attached

Rémi Denis-Courmont  于2023年12月2日周六 02:38写道:

> Le perjantaina 1. joulukuuta 2023, 20.35.10 EET Rémi Denis-Courmont a
> écrit :
> > Le perjantaina 24. marraskuuta 2023, 0.39.39 EET flow gg a écrit :
> > > Okay, changed
> >
> > src/libavcodec/riscv/ac3dsp_init.c: In function ‘ff_ac3dsp_init_riscv’:
> > src/libavcodec/riscv/ac3dsp_init.c:39:33: warning: assignment to ‘void
> (*)
> > (int32_t *, const float *, size_t)’ {aka ‘void (*)(int *, const float *,
> > long unsigned int)’} from incompatible pointer type ‘void (*)(int32_t *,
> > const float *, unsigned int)’ {aka ‘void (*)(int *, const float *,
> unsigned
> > int)’} [- Wincompatible-pointer-types]
> >39 | c->float_to_fixed24 = ff_float_to_fixed24_rvv;
> >
> >   | ^
> >
> > Also the Makefile precondition is inaccurate.
>
> Oh, and on C908, LMUL=8 is actually faster than LMUL=4. Generally
> speaking,
> you should maximise the LMUL unless there is a *specific* reason not to.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From db03232a9ef1caab333b2fb5a1b684a68c7b0114 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 2207.2
float_to_fixed24_rvv_f32: 696.2
---
 libavcodec/riscv/Makefile  |  3 ++-
 libavcodec/riscv/ac3dsp_init.c |  4 
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..b00db279c1 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,7 +1,8 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
- riscv/ac3dsp_rvb.o
+ riscv/ac3dsp_rvb.o \
+ riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..118b2955ca 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <stddef.h>
 #include <stdint.h>
 
 #include "config.h"
@@ -26,6 +27,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -34,5 +36,7 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_ADDR) {
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
+if (flags & AV_CPU_FLAG_RVV_F32)
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
 }
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..b8d32c4677
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavutil/riscv/asm.S"
+
+func ff_float_to_fixed24_rvv, zve32f
+lit1, 1 << 24
+fcvt.s.w  f0, t1
+1:
+vsetvli   t0, a2, e32, m8, ta, ma
+sub   a2, a2, t0
+  

Re: [FFmpeg-devel] [PATCH] ac3dsp: RISC-V V float_to_fixed24

2023-12-01 Thread flow gg
I forgot to modify the Makefile; I've made the changes in this reply.

flow gg  于2023年12月2日周六 03:50写道:

> Okay, changed and attached
>
> Rémi Denis-Courmont  于2023年12月2日周六 02:38写道:
>
>> Le perjantaina 1. joulukuuta 2023, 20.35.10 EET Rémi Denis-Courmont a
>> écrit :
>> > Le perjantaina 24. marraskuuta 2023, 0.39.39 EET flow gg a écrit :
>> > > Okay, changed
>> >
>> > src/libavcodec/riscv/ac3dsp_init.c: In function ‘ff_ac3dsp_init_riscv’:
>> > src/libavcodec/riscv/ac3dsp_init.c:39:33: warning: assignment to ‘void
>> (*)
>> > (int32_t *, const float *, size_t)’ {aka ‘void (*)(int *, const float *,
>> > long unsigned int)’} from incompatible pointer type ‘void (*)(int32_t *,
>> > const float *, unsigned int)’ {aka ‘void (*)(int *, const float *,
>> unsigned
>> > int)’} [- Wincompatible-pointer-types]
>> >39 | c->float_to_fixed24 = ff_float_to_fixed24_rvv;
>> >
>> >   | ^
>> >
>> > Also the Makefile precondition is inaccurate.
>>
>> Oh, and on C908, LMUL=8 is actually faster than LMUL=4. Generally
>> speaking,
>> you should maximise the LMUL unless there is a *specific* reason not to.
>>
>> --
>> レミ・デニ-クールモン
>> http://www.remlab.net/
>>
>>
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
From 2aa06d9d8d4853ac089a13ed6a758f9ecb0aa5a9 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 22 Nov 2023 14:57:29 +0800
Subject: [PATCH] lavc/ac3dsp: R-V V float_to_fixed24

c910
float_to_fixed24_c: 2207.2
float_to_fixed24_rvv_f32: 696.2
---
 libavcodec/riscv/Makefile  |  1 +
 libavcodec/riscv/ac3dsp_init.c |  4 
 libavcodec/riscv/ac3dsp_rvv.S  | 39 ++
 3 files changed, 44 insertions(+)
 create mode 100644 libavcodec/riscv/ac3dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..29a7fec455 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -2,6 +2,7 @@ OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o \
  riscv/ac3dsp_rvb.o
+RVV-OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_rvv.o
 OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_init.o
 RVV-OBJS-$(CONFIG_ALAC_DECODER) += riscv/alacdsp_rvv.o
 OBJS-$(CONFIG_AUDIODSP) += riscv/audiodsp_init.o \
diff --git a/libavcodec/riscv/ac3dsp_init.c b/libavcodec/riscv/ac3dsp_init.c
index 20f294f1de..118b2955ca 100644
--- a/libavcodec/riscv/ac3dsp_init.c
+++ b/libavcodec/riscv/ac3dsp_init.c
@@ -18,6 +18,7 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include 
 #include 
 
 #include "config.h"
@@ -26,6 +27,7 @@
 #include "libavcodec/ac3dsp.h"
 
 void ff_extract_exponents_rvb(uint8_t *exp, int32_t *coef, int nb_coefs);
+void ff_float_to_fixed24_rvv(int32_t *dst, const float *src, size_t len);
 
 av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 {
@@ -34,5 +36,7 @@ av_cold void ff_ac3dsp_init_riscv(AC3DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_ADDR) {
 if (flags & AV_CPU_FLAG_RVB_BASIC)
 c->extract_exponents = ff_extract_exponents_rvb;
+if (flags & AV_CPU_FLAG_RVV_F32)
+c->float_to_fixed24 = ff_float_to_fixed24_rvv;
 }
 }
diff --git a/libavcodec/riscv/ac3dsp_rvv.S b/libavcodec/riscv/ac3dsp_rvv.S
new file mode 100644
index 00..b8d32c4677
--- /dev/null
+++ b/libavcodec/riscv/ac3dsp_rvv.S
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+#include "libavu

[FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-03 Thread flow gg
c910
vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
From cba93503a6f0753b56c1d0cb00f642b3982ee656 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 10:07:40 +0800
Subject: [PATCH] lavc/vc1dsp: R-V V inv_trans

c910
vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
---
 libavcodec/riscv/Makefile  |   2 +
 libavcodec/riscv/vc1dsp_init.c |  47 +
 libavcodec/riscv/vc1dsp_rvv.S  | 123 +
 libavcodec/vc1dsp.c|   2 +
 libavcodec/vc1dsp.h|   1 +
 5 files changed, 175 insertions(+)
 create mode 100644 libavcodec/riscv/vc1dsp_init.c
 create mode 100644 libavcodec/riscv/vc1dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..442c5961ea 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -39,5 +39,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
+OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
new file mode 100644
index 00..88e0434f0e
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vc1.h"
+
+void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_I64) {
+dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
+dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
+}
+if (flags & AV_CPU_FLAG_RVV_I32) {
+dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
+dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
+}
+#endif
+}
diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
new file mode 100644
index 00..8a6b27192a
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_rvv.S
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/r

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-04 Thread flow gg
> Probably missing VLENB checks.

Changed.

> You can multiply by 3, 5 or 9 with shift-and-add. By 12 with shift-and-add
> then shift, and by 17 with shift then add. You don't need multiplications.

Changed.

> Do you really need to splat? Can't .vx or .wx be used instead?

Okay, for example in ff_vc1_inv_trans_8x8_dc_rvv

+ vsetvli   zero, t0, e8, m2, ta, ma
+ vwaddu.vx v4, v0, zero
+ vsetvli   zero, t0, e16, m4, ta, ma
+ vadd.vx   v4, v4, t2
- vsetvli   zero, t0, e16, m4, ta, ma
- vmv.v.x   v4, t2
- vsetvli   zero, t0, e8, m2, ta, ma
- vwaddu.wv v4, v4, v0

However, this is slightly slower on the C910, so I'm not sure whether I
should make this change.

                                      splat | no splat
vc1dsp.vc1_inv_trans_4x4_dc_c:         84.0 | 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32:   74.0 | 76.0
vc1dsp.vc1_inv_trans_4x8_dc_c:        150.0 | 150.0
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32:   83.5 | 84.5
vc1dsp.vc1_inv_trans_8x4_dc_c:        129.0 | 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64:   76.0 | 76.7
vc1dsp.vc1_inv_trans_8x8_dc_c:        255.0 | 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64:   90.5 | 93.0

> The code below uses fractional multipliers, so I infer that the benchmarked
> code was significantly different, and the measurements are not really worth
> the bother.
>
> I know that supply is a problem at the moment, but if you are going to keep
> this up, I would hope that ISCAS can get you access to an RVV 1.0 board.

Using mf2 only requires changing the first vset and the last vset in each
function to mf2.
I guess they would achieve similar effects on both c910 and c908?

example in 8x8
```
- vsetivli zero, 8, e64, m4, ta, ma
+ vsetivli zero, 8, e8, mf2, ta, ma
- vsetvli  zero, zero, e64, m4, ta, ma
+ vsetivli zero, 8, e8, mf2, ta, ma
```

And ISCAS seems to have no announcement about getting an RVV 1.0 board. I
plan to ask about it from time to time.

Rémi Denis-Courmont  于2023年12月4日周一 01:17写道:

> Le sunnuntaina 3. joulukuuta 2023, 16.40.08 EET flow gg a écrit :
> > c910
> > vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
> > vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
> > vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
> > vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
> > vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
> > vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
> > vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
> > vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
>
> The code below uses fractional multipliers, so I infer that the
> benchmarked
> code was significantly different, and the measurements are not really
> worth the
> bother.
>
> I know that supply is a problem at the moment, but I if you are going to
> keep
> this up, I would hope that ISCAS can get you access to an RVV 1.0 board.
>
> In-line...
>
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > new file mode 100644
> > index 00..88e0434f0e
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -0,0 +1,47 @@
> > +/*
> > + * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include 
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vc1.h"
> > +
> > +void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> 

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-04 Thread flow gg
I found that in the case of nosplat, an additional vset can be removed, and
the time is basically the same, so I updated the patch.

Rémi Denis-Courmont  于2023年12月4日周一 23:15写道:

> Le maanantaina 4. joulukuuta 2023, 10.48.56 EET flow gg a écrit :
> > > Probably missing VLENB checks.
> >
> > Changed.
> >
> > > You can multiply by 3, 5 or 9 with shift-and-add. By 12 with
> shift-and-add
> > > then shift, and by 17 with shift then add. You don't need
> multiplications.
> >
> > Changed.
> >
> > > Do you really need to splat? Can't .vx or .wx be used instead?
> >
> > Okay, for example in ff_vc1_inv_trans_8x8_dc_rvv
> >
> > + vsetvli   zero, t0, e8, m2, ta, ma
> > + vwaddu.vx v4, v0, zero
> > + vsetvli   zero, t0, e16, m4, ta, ma
> > + vadd.vx   v4, v4, t2
> > - vsetvli   zero, t0, e16, m4, ta, ma
> > - vmv.v.x   v4, t2
> > - vsetvli   zero, t0, e8, m2, ta, ma
> > - vwaddu.wv v4, v4, v0
> >
> > But the speed has slowed down slightly on the c910,
> > I'm not sure if I should modify it.
>
> OK, unfortunately, there is no widening addition with wide scalar operand.
> But
> you can do zero-extension then addition here. In the end, I doubt that you
> can
> reasonably optimise whilst working with a C910-based board. This function
> deviates too much on non-conformant hardware.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From e4040b48245ee1733bcda4168e3c78a9a9cf82c0 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 10:07:40 +0800
Subject: [PATCH] lavc/vc1dsp: R-V V inv_trans

c910
vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
---
 libavcodec/riscv/Makefile  |   2 +
 libavcodec/riscv/vc1dsp_init.c |  47 +
 libavcodec/riscv/vc1dsp_rvv.S  | 118 +
 libavcodec/vc1dsp.c|   2 +
 libavcodec/vc1dsp.h|   1 +
 5 files changed, 170 insertions(+)
 create mode 100644 libavcodec/riscv/vc1dsp_init.c
 create mode 100644 libavcodec/riscv/vc1dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..442c5961ea 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -39,5 +39,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
+OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
new file mode 100644
index 00..6a04bea6d6
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vc1.h"
+
+void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int

Re: [FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-12-04 Thread flow gg
Because there was a conflict, the patch was updated in the reply

flow gg  于2023年12月1日周五 04:25写道:

> Okay, I split it and attached
>
>
>
> Rémi Denis-Courmont  于2023年11月30日周四 23:31写道:
>
>> Le tiistaina 28. marraskuuta 2023, 18.59.38 EET flow gg a écrit :
>> >
>>
>> Since nobody else commented, I shall note that you should probably split
>> the
>> underlying lavc changes into a separate preliminary patch.
>>
>> --
>> レミ・デニ-クールモン
>> http://www.remlab.net/
>>
>>
>>
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
From 278632c349dad4f5f551e7565511d5095b607d06 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 04:21:53 +0800
Subject: [PATCH 1/2] lavc/aacenc: add ff_aac_dsp_init

This is for clarity and use in testing, consistent with other parts of the code.
---
 libavcodec/aacenc.c | 24 ++--
 libavcodec/aacenc.h |  1 +
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 5e6a255a8f..443b25e25a 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -1381,16 +1381,7 @@ static av_cold int aac_encode_init(AVCodecContext *avctx)
 ff_lpc_init(&s->lpc, 2*avctx->frame_size, TNS_MAX_ORDER, FF_LPC_TYPE_LEVINSON);
 s->random_state = 0x1f2e3d4c;
 
-s->abs_pow34   = abs_pow34_v;
-s->quant_bands = quantize_bands;
-
-#if ARCH_X86
-ff_aac_dsp_init_x86(s);
-#endif
-
-#if HAVE_MIPSDSP
-ff_aac_coder_init_mips(s);
-#endif
+    ff_aac_dsp_init(s);
 
 ff_af_queue_init(avctx, &s->afq);
 
@@ -1444,3 +1435,16 @@ const FFCodec ff_aac_encoder = {
  AV_SAMPLE_FMT_NONE },
 .p.priv_class   = &aacenc_class,
 };
+
+void ff_aac_dsp_init(AACEncContext *s){
+s->abs_pow34   = abs_pow34_v;
+s->quant_bands = quantize_bands;
+
+#if ARCH_X86
+ff_aac_dsp_init_x86(s);
+#endif
+
+#if HAVE_MIPSDSP
+ff_aac_coder_init_mips(s);
+#endif
+}
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index b030c652ae..09dd8639be 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -154,6 +154,7 @@ typedef struct AACEncContext {
 } buffer;
 } AACEncContext;
 
+void ff_aac_dsp_init(AACEncContext *s);
 void ff_aac_dsp_init_x86(AACEncContext *s);
 void ff_aac_coder_init_mips(AACEncContext *c);
 void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
-- 
2.43.0

From 0ce3bcfee65eab85d899ee734ab75c4202aa03ac Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 28 Nov 2023 14:08:12 +0800
Subject: [PATCH 2/2] checkasm: test for abs_pow34

---
 tests/checkasm/Makefile|  1 +
 tests/checkasm/aacencdsp.c | 70 ++
 tests/checkasm/checkasm.c  |  3 ++
 tests/checkasm/checkasm.h  |  1 +
 tests/fate/checkasm.mak|  3 +-
 5 files changed, 77 insertions(+), 1 deletion(-)
 create mode 100644 tests/checkasm/aacencdsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index 53742c93ae..05f07ca560 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -23,6 +23,7 @@ AVCODECOBJS-$(CONFIG_VIDEODSP)  += videodsp.o
 # decoders/encoders
 AVCODECOBJS-$(CONFIG_AAC_DECODER)   += aacpsdsp.o \
sbrdsp.o
+AVCODECOBJS-$(CONFIG_AAC_ENCODER)   += aacencdsp.o
 AVCODECOBJS-$(CONFIG_ALAC_DECODER)  += alacdsp.o
 AVCODECOBJS-$(CONFIG_DCA_DECODER)   += synth_filter.o
 AVCODECOBJS-$(CONFIG_EXR_DECODER)   += exrdsp.o
diff --git a/tests/checkasm/aacencdsp.c b/tests/checkasm/aacencdsp.c
new file mode 100644
index 00..684c775862
--- /dev/null
+++ b/tests/checkasm/aacencdsp.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+
+#include "libavutil/mem.h"
+#include "libavutil/mem_internal.h

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-04 Thread flow gg
Okay, after switching to zext, two vsets can be deleted, which is better than
the splat version. I have updated the patch in this reply.

Rémi Denis-Courmont  于2023年12月4日周一 23:15写道:

> Le maanantaina 4. joulukuuta 2023, 10.48.56 EET flow gg a écrit :
> > > Probably missing VLENB checks.
> >
> > Changed.
> >
> > > You can multiply by 3, 5 or 9 with shift-and-add. By 12 with
> shift-and-add
> > > then shift, and by 17 with shift then add. You don't need
> multiplications.
> >
> > Changed.
> >
> > > Do you really need to splat? Can't .vx or .wx be used instead?
> >
> > Okay, for example in ff_vc1_inv_trans_8x8_dc_rvv
> >
> > + vsetvli   zero, t0, e8, m2, ta, ma
> > + vwaddu.vx v4, v0, zero
> > + vsetvli   zero, t0, e16, m4, ta, ma
> > + vadd.vx   v4, v4, t2
> > - vsetvli   zero, t0, e16, m4, ta, ma
> > - vmv.v.x   v4, t2
> > - vsetvli   zero, t0, e8, m2, ta, ma
> > - vwaddu.wv v4, v4, v0
> >
> > But the speed has slowed down slightly on the c910,
> > I'm not sure if I should modify it.
>
> OK, unfortunately, there is no widening addition with wide scalar operand.
> But
> you can do zero-extension then addition here. In the end, I doubt that you
> can
> reasonably optimise whilst working with a C910-based board. This function
> deviates too much on non-conformant hardware.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 4022465172a5ac0c9669795e318edfa74e9d346e Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 10:07:40 +0800
Subject: [PATCH] lavc/vc1dsp: R-V V inv_trans

c910
vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
---
 libavcodec/riscv/Makefile  |   2 +
 libavcodec/riscv/vc1dsp_init.c |  47 ++
 libavcodec/riscv/vc1dsp_rvv.S  | 114 +
 libavcodec/vc1dsp.c|   2 +
 libavcodec/vc1dsp.h|   1 +
 5 files changed, 166 insertions(+)
 create mode 100644 libavcodec/riscv/vc1dsp_init.c
 create mode 100644 libavcodec/riscv/vc1dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..442c5961ea 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -39,5 +39,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
+OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
new file mode 100644
index 00..6a04bea6d6
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vc1.h"
+
+void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-05 Thread flow gg
> This block can be folded into the next. You don't need to check VLENB
> twice.

Changed.

> Instruction scheduling could be better, especially on in-order CPUs.

I put the vload at the front, and then proceeded with the t2 operation, but
I'm not sure whether this is the scheduling you had in mind.

> You don't need to reset the AVL here, just pass zero.

Changed.

> vsetivli

Changed.

Rémi Denis-Courmont  于2023年12月6日周三 00:41写道:

> Hi,
>
> > diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> > index 2d0e6c19c8..442c5961ea 100644
> > --- a/libavcodec/riscv/Makefile
> > +++ b/libavcodec/riscv/Makefile
> > @@ -39,5 +39,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o
> \
> >  RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
> >  OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
> >  RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
> > +OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
> > +RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
> >  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
> >  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > new file mode 100644
> > index 00..6a04bea6d6
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -0,0 +1,47 @@
> > +/*
> > + * Copyright (c) 2023 Institute of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include 
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vc1.h"
> > +
> > +void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +
> > +av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> > +{
> > +#if HAVE_RVV
> > +int flags = av_get_cpu_flags();
> > +
> > +if ((flags & AV_CPU_FLAG_RVV_I64) && ff_get_rv_vlenb() >= 16) {
> > +dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> > +dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +}
>
> This block can be folded into the next. You don't need to check VLENB
> twice.
>
> > +if ((flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16) {
> > +dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> > +dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +}
> > +#endif
> > +}
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S
> b/libavcodec/riscv/vc1dsp_rvv.S
> > new file mode 100644
> > index 00..b865bd2cbc
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -0,0 +1,114 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include "libavutil/riscv/asm.S"
> > +
> > +func ff_vc1_inv_trans_8x8_dc_rvv, zve64x
> > +lht2, (a2)
> > +sh1addt2, t2, t2
> > +addi  t2, t2, 1
> > +srai  t2, t2, 1
> > +sh1addt2, t2, t2
> > +addi  t2, t2, 16
> > +   

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-05 Thread flow gg
I'm sorry for my carelessness. It's because I used to build and run
manually, but now I've switched to a script to do it, so I accidentally
missed the error. I will modify the script to avoid this kind of issue
in the future.

libavcodec/riscv/vc1dsp_rvv.S:35: Error: improper CSRxI immediate

Changed.

Rémi Denis-Courmont  于2023年12月6日周三 04:11写道:

> Le tiistaina 5. joulukuuta 2023, 21.25.12 EET flow gg a écrit :
> > > This block can be folded into the next. You don't need to check VLENB
> >
> > twice.
> >
> > Changed.
> >
> > > Instruction scheduling could be better, especially on in-order CPUs.
> >
> > I put the vload at the front, and then proceeded with the t2 operation,
> but
> > I'm not sure...
> >
> > > You don't need to reset the AVL here, just pass zero.
> >
> > Changed.
> >
> > > vsetivli
> >
> > Changed.
>
> You changed more than I asked for. The immediate AVL is a 5-bit unsigned
> integer, so it should not be possible to assemble 32 or 64, unless you
> have a
> preprocessor that silently rewrites `vsetivli` into `vsetvli` (If so, that
> sounds very iffy because `vsetivli zero` has no scratch X register to work
> with).
>
> FWIW CanMV-K230 boards are on sale for under 500 RMB.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 653b8a8651aaa6c22e8a3a400b3e493eb03704f2 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Fri, 1 Dec 2023 10:07:40 +0800
Subject: [PATCH] lavc/vc1dsp: R-V V inv_trans

c910
vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
---
 libavcodec/riscv/Makefile  |   2 +
 libavcodec/riscv/vc1dsp_init.c |  49 ++
 libavcodec/riscv/vc1dsp_rvv.S  | 113 +
 libavcodec/vc1dsp.c|   2 +
 libavcodec/vc1dsp.h|   1 +
 5 files changed, 167 insertions(+)
 create mode 100644 libavcodec/riscv/vc1dsp_init.c
 create mode 100644 libavcodec/riscv/vc1dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 2d0e6c19c8..442c5961ea 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -39,5 +39,7 @@ OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_init.o \
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_init.o
 RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) += riscv/utvideodsp_rvv.o
+OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
+RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vc1dsp_init.c b/libavcodec/riscv/vc1dsp_init.c
new file mode 100644
index 00..0d22d28f4d
--- /dev/null
+++ b/libavcodec/riscv/vc1dsp_init.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include 
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/vc1.h"
+
+void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+
+av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flag

Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-05 Thread flow gg
> FWIW CanMV-K230 boards are on sale for under 500 RMB.

I just made a payment ~ (I saw you mention in IRC that you're going to
write about K230+Debian. Looking forward to it)

Rémi Denis-Courmont  于2023年12月6日周三 04:11写道:

> Le tiistaina 5. joulukuuta 2023, 21.25.12 EET flow gg a écrit :
> > > This block can be folded into the next. You don't need to check VLENB
> >
> > twice.
> >
> > Changed.
> >
> > > Instruction scheduling could be better, especially on in-order CPUs.
> >
> > I put the vload at the front, and then proceeded with the t2 operation,
> but
> > I'm not sure...
> >
> > > You don't need to reset the AVL here, just pass zero.
> >
> > Changed.
> >
> > > vsetivli
> >
> > Changed.
>
> You changed more than I asked for. The immediate AVL is a 5-bit unsigned
> integer, so it should not be possible to assemble 32 or 64, unless you
> have a
> preprocessor that silently rewrites `vsetivli` into `vsetvli` (If so, that
> sounds very iffy because `vsetivli zero` has no scratch X register to work
> with).
>
> FWIW CanMV-K230 boards are on sale for under 500 RMB.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V inv_trans

2023-12-07 Thread flow gg
Hello, I have received the K230, and then installed Debian following your
method. Therefore, I have updated the benchmark of K230 in the patch of
this reply.

k230
vc1dsp.vc1_inv_trans_4x4_dc_c: 125.7
vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 53.5
vc1dsp.vc1_inv_trans_4x8_dc_c: 230.7
vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 65.5
vc1dsp.vc1_inv_trans_8x4_dc_c: 228.7
vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 64.5
vc1dsp.vc1_inv_trans_8x8_dc_c: 476.5
vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 80.2



Rémi Denis-Courmont  于2023年12月4日周一 01:17写道:

> Le sunnuntaina 3. joulukuuta 2023, 16.40.08 EET flow gg a écrit :
> > c910
> > vc1dsp.vc1_inv_trans_4x4_dc_c: 84.0
> > vc1dsp.vc1_inv_trans_4x4_dc_rvv_i32: 74.0
> > vc1dsp.vc1_inv_trans_4x8_dc_c: 150.2
> > vc1dsp.vc1_inv_trans_4x8_dc_rvv_i32: 83.5
> > vc1dsp.vc1_inv_trans_8x4_dc_c: 129.0
> > vc1dsp.vc1_inv_trans_8x4_dc_rvv_i64: 75.7
> > vc1dsp.vc1_inv_trans_8x8_dc_c: 254.7
> > vc1dsp.vc1_inv_trans_8x8_dc_rvv_i64: 90.5
>
> The code below uses fractional multipliers, so I infer that the
> benchmarked
> code was significantly different, and the measurements are not really
> worth the
> bother.
>
> I know that supply is a problem at the moment, but if you are going to
> keep
> this up, I would hope that ISCAS can get you access to an RVV 1.0 board.
>
> In-line...
>
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > new file mode 100644
> > index 00..88e0434f0e
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -0,0 +1,47 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301
> > USA
> > + */
> > +
> > +#include 
> > +
> > +#include "libavutil/attributes.h"
> > +#include "libavutil/cpu.h"
> > +#include "libavutil/riscv/cpu.h"
> > +#include "libavcodec/vc1.h"
> > +
> > +void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride,
> int16_t
> > *block);
> > +
> > +av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> > +{
> > +#if HAVE_RVV
> > +int flags = av_get_cpu_flags();
> > +
> > +if (flags & AV_CPU_FLAG_RVV_I64) {
> > +dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> > +dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +}
> > +if (flags & AV_CPU_FLAG_RVV_I32) {
> > +dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> > +dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +}
>
> Probably missing VLENB checks.
>
> > +#endif
> > +}
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S
> b/libavcodec/riscv/vc1dsp_rvv.S
> > new file mode 100644
> > index 00..8a6b27192a
> > --- /dev/null
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -0,0 +1,123 @@
> > +/*
> > + * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences
> > (ISCAS).
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is

Re: [FFmpeg-devel] [PATCH 2/2] lavc/aacencdsp: R-V V abs_pow34

2023-12-09 Thread flow gg
Updated the patch to resolve conflicts, updated m4 to m8, using c908's
benchmark.

flow gg  于2023年11月29日周三 01:00写道:

> c910:
> abs_pow34_c: 24610.7
> abs_pow34_rvv_f32: 6177.7
>
> (need use "[FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34" first)
>
From 7dd1efb25e4cf6f98afde90bb472002d3aba1830 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 28 Nov 2023 20:14:14 +0800
Subject: [PATCH 2/2] lavc/aacencdsp: R-V V abs_pow34

C908:
abs_pow34_c: 535.5
abs_pow34_rvv_f32: 337.2
---
 libavcodec/aacenc.c   |  4 +++
 libavcodec/aacenc.h   |  1 +
 libavcodec/riscv/Makefile |  2 ++
 libavcodec/riscv/aacencdsp_init.c | 42 +++
 libavcodec/riscv/aacencdsp_rvv.S  | 38 
 5 files changed, 87 insertions(+)
 create mode 100644 libavcodec/riscv/aacencdsp_init.c
 create mode 100644 libavcodec/riscv/aacencdsp_rvv.S

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index 443b25e25a..55c4bf55ce 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -1440,6 +1440,10 @@ void ff_aac_dsp_init(AACEncContext *s){
 s->abs_pow34   = abs_pow34_v;
 s->quant_bands = quantize_bands;
 
+#if ARCH_RISCV
+ff_aac_dsp_init_riscv(s);
+#endif
+
 #if ARCH_X86
 ff_aac_dsp_init_x86(s);
 #endif
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index 09dd8639be..18b424736d 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -155,6 +155,7 @@ typedef struct AACEncContext {
 } AACEncContext;
 
 void ff_aac_dsp_init(AACEncContext *s);
+void ff_aac_dsp_init_riscv(AACEncContext *s);
 void ff_aac_dsp_init_x86(AACEncContext *s);
 void ff_aac_coder_init_mips(AACEncContext *c);
 void ff_quantize_band_cost_cache_init(struct AACEncContext *s);
diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index e9825c0856..69805e3b19 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -1,5 +1,7 @@
 OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_init.o riscv/sbrdsp_init.o
 RVV-OBJS-$(CONFIG_AAC_DECODER) += riscv/aacpsdsp_rvv.o riscv/sbrdsp_rvv.o
+OBJS-$(CONFIG_AAC_ENCODER) += riscv/aacencdsp_init.o
+RVV-OBJS-$(CONFIG_AAC_ENCODER) += riscv/aacencdsp_rvv.o
 OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_init.o
 RV-OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_rvb.o
 RVV-OBJS-$(CONFIG_AC3DSP) += riscv/ac3dsp_rvv.o
diff --git a/libavcodec/riscv/aacencdsp_init.c b/libavcodec/riscv/aacencdsp_init.c
new file mode 100644
index 00..83ae16f46b
--- /dev/null
+++ b/libavcodec/riscv/aacencdsp_init.c
@@ -0,0 +1,42 @@
+/*
+ * AAC encoder assembly optimizations
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/float_dsp.h"
+#include "libavutil/cpu.h"
+#include "libavcodec/aacenc.h"
+
+void ff_abs_pow34_rvv(float *out, const float *in, const int size);
+
+av_cold void ff_aac_dsp_init_riscv(AACEncContext *s)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if (flags & AV_CPU_FLAG_RVV_F32) {
+if (flags & AV_CPU_FLAG_RVB_ADDR) {
+s->abs_pow34 = ff_abs_pow34_rvv;
+}
+}
+#endif
+}
diff --git a/libavcodec/riscv/aacencdsp_rvv.S b/libavcodec/riscv/aacencdsp_rvv.S
new file mode 100644
index 00..4c7a874d77
--- /dev/null
+++ b/libavcodec/riscv/aacencdsp_rvv.S
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2023 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; 

Re: [FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-12-09 Thread flow gg
There's a strange issue:

Adding tests can compile successfully on x86 and lichee4a (risc v), but it
results in an error on k230.

> collect2: fatal error: ld terminated with signal 9 [Killed]
> compilation terminated.
>
> [32833.539109]
oom-kill:constraint=CONSTRAINT_NONE,nodemask=(null),task=ld,pid=6804,uid=1000
> [32833.547321] Out of memory: Killed process 6804 (ld) total-vm:363180kB,
anon-rss:357536kB, file-rss:932kB, shmem-rss:0kB, UID:1000 pgtables:732kB
oom_score_adj:0
> [32833.653223] oom_reaper: reaped process 6804 (ld), now anon-rss:0kB,
file-rss:0kB, shmem-rss:0kB

If I remove the line 1429 with FF_CODEC_ENCODE_CB(aac_encode_frame), there
is no error on k230, but I am unsure of the reason.

flow gg  于2023年12月5日周二 05:46写道:

> Because there was a conflict, the patch was updated in the reply
>
> flow gg  于2023年12月1日周五 04:25写道:
>
>> Okay, I split it and attached
>>
>>
>>
>> Rémi Denis-Courmont  于2023年11月30日周四 23:31写道:
>>
>>> Le tiistaina 28. marraskuuta 2023, 18.59.38 EET flow gg a écrit :
>>> >
>>>
>>> Since nobody else commented, I shall note that you should probably split
>>> the
>>> underlying lavc changes into a separate preliminary patch.
>>>
>>> --
>>> レミ・デニ-クールモン
>>> http://www.remlab.net/
>>>
>>>
>>>
>>> ___
>>> ffmpeg-devel mailing list
>>> ffmpeg-devel@ffmpeg.org
>>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>>
>>> To unsubscribe, visit link above, or email
>>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>>
>>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/2] checkasm: test for abs_pow34

2023-12-09 Thread flow gg
To be clear, I mean removing
libavcodec/aacenc.c:1429 FF_CODEC_ENCODE_CB(aac_encode_frame)
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V put_epel h

2024-03-26 Thread flow gg
Okay, changed to use const, updated at this GitHub link (
https://github.com/hleft/FFmpeg/tree/vp8vp9)

Rémi Denis-Courmont  于2024年3月27日周三 02:38写道:

> Le perjantaina 22. maaliskuuta 2024, 8.01.00 EET flow gg a écrit :
> > (This should be used after applying these 4 patches)
> >
> > ```
> > [FFmpeg-devel] [PATCH] lavc/vp8dsp: R-V V put_vp8_pixels
> > [FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V put_bilin_h
> > 1-3
> > ```
>
> In general, I am not sure that it is safe to lay constant data out without
> specifying a section.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/7] lavc/vp9dsp: R-V mc copy_avg

2024-03-26 Thread flow gg
Hi, here's the github link (https://github.com/hleft/FFmpeg/tree/vp8vp9)

Rémi Denis-Courmont  于2024年3月27日周三 02:30写道:

> Hi,
>
> Le perjantaina 22. maaliskuuta 2024, 8.12.41 EET flow gg a écrit :
> > It might be a bit inconvenient to find the patches related to vp8, vp9
> that
> > were sent earlier. Here, I've placed them in a zip file in this reply
>
> ZIP files are not particularly convenient. The only easy way for me to
> review
> is if patches are sent with git-send-email. (As for merging, the easiest
> is of
> course to pull them from a git repository.)
>
> This would not be a problem if FFmpeg used a web forge, but hell will
> freeze
> before that.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V put_epel h

2024-03-27 Thread flow gg
Alright,  updated it in this reply

Rémi Denis-Courmont  于2024年3月27日周三 16:18写道:

> Hi,
>
> Le 27 mars 2024 04:37:02 GMT+02:00, flow gg  a
> écrit :
> >Okay, changed to use const, updated at this GitHub link (
> >https://github.com/hleft/FFmpeg/tree/vp8vp9)
>
> OK, that might be easier for *me* to merge but the rule here is to post
> via ML for code review by anybody.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 387ecc8f3acb4c9494388d96e5a46fc1c856b954 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 9 Mar 2024 08:41:31 +0800
Subject: [PATCH 1/3] lavc/vp8dsp: R-V V put_epel h

C908:
vp8_put_epel4_h4_c: 10.7
vp8_put_epel4_h4_rvv_i32: 5.0
vp8_put_epel4_h6_c: 15.0
vp8_put_epel4_h6_rvv_i32: 6.2
vp8_put_epel8_h4_c: 43.2
vp8_put_epel8_h4_rvv_i32: 11.2
vp8_put_epel8_h6_c: 57.5
vp8_put_epel8_h6_rvv_i32: 13.5
vp8_put_epel16_h4_c: 92.5
vp8_put_epel16_h4_rvv_i32: 13.7
vp8_put_epel16_h6_c: 139.0
vp8_put_epel16_h6_rvv_i32: 16.5
---
 libavcodec/riscv/vp8dsp_init.c |   7 +++
 libavcodec/riscv/vp8dsp_rvv.S  | 105 +
 2 files changed, 112 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 02dbda979e..6614d661f7 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -78,6 +78,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 c->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_rvv;
 c->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_rvv;
 c->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_rvv;
+
+c->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_rvv;
+c->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_rvv;
+c->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_rvv;
+c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
+c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
+c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 9d4ffed255..84e8ec61de 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -223,3 +223,108 @@ endfunc
 func ff_put_vp8_bilin4_hv_rvv, zve32x
 put_vp8_bilin_hv 4
 endfunc
+
+const subpel_filters
+.byte 0,  -6, 123,  12,  -1, 0
+.byte 2, -11, 108,  36,  -8, 1
+.byte 0,  -9,  93,  50,  -6, 0
+.byte 3, -16,  77,  77, -16, 3
+.byte 0,  -6,  50,  93,  -9, 0
+.byte 1,  -8,  36, 108, -11, 2
+.byte 0,  -1,  12, 123,  -6, 0
+endconst
+
+.macro epel_filter size
+lla t2, subpel_filters
+addit0, a5, -1
+li  t1, 6
+mul t0, t0, t1
+add t0, t0, t2
+.irp n 1,2,3,4
+lb  t\n, \n(t0)
+.endr
+.ifc \size,6
+lb  t5, 5(t0)
+lb  t0, (t0)
+.endif
+.endm
+
+.macro epel_load dst len size
+addit6, a2, -1
+addia7, a2, 1
+vle8.v  v24, (a2)
+vle8.v  v22, (t6)
+vle8.v  v26, (a7)
+addia7, a7, 1
+vle8.v  v28, (a7)
+vwmulu.vx   v16, v24, t2
+vwmulu.vx   v20, v26, t3
+.ifc \size,6
+addit6, t6, -1
+addia7, a7, 1
+vle8.v  v24, (t6)
+vle8.v  v26, (a7)
+vwmaccu.vx  v16, t0, v24
+vwmaccu.vx  v16, t5, v26
+.endif
+li  t6, 64
+vwmaccsu.vx v16, t1, v22
+vwmaccsu.vx v16, t4, v28
+vwadd.wxv16, v16, t6
+
+.ifc \len,4
+vsetvli zero, zero, e16, mf2, ta, ma
+.elseif \len == 8
+vsetvli zero, zero, e16, m1, ta, ma
+.else
+vsetvli zero, zero, e16, m2, ta, ma
+.endif
+
+vwadd.vvv24, v16, v20
+vnsra.wiv24, v24, 7
+vmax.vx v24, v24, zero
+.ifc \len,4
+vsetvli zero, zero, e8, mf4, ta, ma
+.elseif \len == 8
+vsetvli zero, zero, e8, mf2, ta, ma
+.else
+vsetvli zero, zero, e8, m1, ta, ma
+.endif
+vnclipu.wi  \dst, v24, 0
+.endm
+
+.macro epel_load_inc dst len size
+epel_load   \dst \len \size
+add a2, a2, a3
+.endm
+
+.macro epel len size
+epel_filter \size
+
+.ifc \len,4
+vsetivlizero, 4, e8, mf4, ta, ma
+.elseif \len == 8
+vsetivlizero, 8, e8, mf2, ta, ma
+.else
+

Re: [FFmpeg-devel] [PATCH 2/3] lavc/vp8dsp: R-V V put_epel v

2024-03-27 Thread flow gg
Okay, changed in the reply and github (another reason for not doing so
initially was the thought that there weren't enough registers available,
and that other changes would need to be made that could cause side effects,
but it turns out that the vp8 registers are sufficient; it's just that
vp9 doesn't have enough)

Rémi Denis-Courmont  于2024年3月27日周三 23:36写道:

> Le perjantaina 22. maaliskuuta 2024, 8.01.21 EET flow gg a écrit :
> >
>
> IMO, you could just as well share the code and avoid most if's. Not like
> one
> additional `li a3, 1` per function call is going to matter in the grand
> scheme
> of things. It might even help by reducing I-cache pressure.
>
> --
> 雷米‧德尼-库尔蒙
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From 920e5274b9fb98fc1ac97d0644a9bb7c890e8f39 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Thu, 21 Mar 2024 17:49:54 +0800
Subject: [PATCH 2/3] lavc/vp8dsp: R-V V put_epel v

C908:
vp8_put_epel4_v4_c: 11.0
vp8_put_epel4_v4_rvv_i32: 5.0
vp8_put_epel4_v6_c: 16.5
vp8_put_epel4_v6_rvv_i32: 6.2
vp8_put_epel8_v4_c: 43.7
vp8_put_epel8_v4_rvv_i32: 11.2
vp8_put_epel8_v6_c: 68.7
vp8_put_epel8_v6_rvv_i32: 13.2
vp8_put_epel16_v4_c: 92.5
vp8_put_epel16_v4_rvv_i32: 13.7
vp8_put_epel16_v6_c: 135.7
vp8_put_epel16_v6_rvv_i32: 16.5
---
 libavcodec/riscv/vp8dsp_init.c |  7 ++
 libavcodec/riscv/vp8dsp_rvv.S  | 46 +++---
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 6614d661f7..2f123b67fe 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -85,6 +85,13 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 c->put_vp8_epel_pixels_tab[0][0][1] = ff_put_vp8_epel16_h4_rvv;
 c->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_rvv;
 c->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_rvv;
+
+c->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_rvv;
+c->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_rvv;
+c->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_rvv;
+c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
+c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
+c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 84e8ec61de..440a965ddd 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -234,9 +234,13 @@ const subpel_filters
 .byte 0,  -1,  12, 123,  -6, 0
 endconst
 
-.macro epel_filter size
+.macro epel_filter size type
 lla t2, subpel_filters
+.ifc \type,v
+addit0, a6, -1
+.elseif \type == h
 addit0, a5, -1
+.endif
 li  t1, 6
 mul t0, t0, t1
 add t0, t0, t2
@@ -249,19 +253,25 @@ endconst
 .endif
 .endm
 
-.macro epel_load dst len size
-addit6, a2, -1
-addia7, a2, 1
+.macro epel_load dst len size type
+.ifc \type,v
+mv  a5, a3
+.else
+li  a5, 1
+.endif
+sub t6, a2, a5
+add a7, a2, a5
+.if \from_mem
 vle8.v  v24, (a2)
 vle8.v  v22, (t6)
 vle8.v  v26, (a7)
-addia7, a7, 1
+add a7, a7, a5
 vle8.v  v28, (a7)
 vwmulu.vx   v16, v24, t2
 vwmulu.vx   v20, v26, t3
 .ifc \size,6
-addit6, t6, -1
-addia7, a7, 1
+sub t6, t6, a5
+add a7, a7, a5
 vle8.v  v24, (t6)
 vle8.v  v26, (a7)
 vwmaccu.vx  v16, t0, v24
@@ -293,13 +303,13 @@ endconst
 vnclipu.wi  \dst, v24, 0
 .endm
 
-.macro epel_load_inc dst len size
-epel_load   \dst \len \size
+.macro epel_load_inc dst len size type
+epel_load   \dst \len \size \type
 add a2, a2, a3
 .endm
 
-.macro epel len size
-epel_filter \size
+.macro epel len size type
+epel_filter \size \type
 
 .ifc \len,4
 vsetivlizero, 4, e8, mf4, ta, ma
@@ -311,7 +321,7 @@ endconst
 
 1:
 addia4, a4, -1
-epel_load_inc   v30 \len \size
+epel_load_inc   v30 \len \size \type
 vse8.v  v30, (a0)
 add a0, a0, a1
 bneza4, 1b
@@ -321,10 +

Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp9dsp: R-V V ipred dc

2024-03-27 Thread flow gg
I don't quite understand, I think here 8x8 because zve64x is not suitable
for sharing, it shares between dc16x16 and dc32x32, there isn't much common
code, it would require adding 3 if-else statements and function parameters,
it feels okay not to extract too.

Rémi Denis-Courmont  于2024年3月27日周三 23:41写道:

> Le perjantaina 22. maaliskuuta 2024, 8.02.08 EET flow gg a écrit :
> > Using macros to shorten function definitions, updated in this response
>
> Did you try to share the common code after getdc and see how slower it is?
> If
> an extra static branch has negligible overhead, it would reduce binary
> size
> quite a bit here, AFAICT.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/4] lavc/vp9dsp: R-V V ipred dc

2024-04-06 Thread flow gg
Okay, updated it in the reply and github(
https://github.com/hleft/FFmpeg/tree/vp8vp9)

Rémi Denis-Courmont  于2024年4月4日周四 04:22写道:

> Le torstaina 28. maaliskuuta 2024, 4.44.33 EEST flow gg a écrit :
> > I don't quite understand, I think here 8x8 because zve64x is not suitable
> > for sharing, it shares between dc16x16 and dc32x32, there isn't much
> common
> > code, it would require adding 3 if-else statements and function
> parameters,
> > it feels okay not to extract too.
>
> I agree that we can't realistically share code between the different block
> sizes. My point was that the code after getdc is lengthy (after expansion)
> and
> fixed for a given block size, so *that* code could be shared and jumped as
> common function tail.
>
> --
> Rémi Denis-Courmont
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From f4e49d6f26c1ed85907a4ef7596dcc7b77cd9b8c Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Mon, 26 Feb 2024 14:42:17 +0800
Subject: [PATCH 08/18] lavc/vp9dsp: R-V V ipred dc

C908:
vp9_dc_8x8_8bpp_c: 46.0
vp9_dc_8x8_8bpp_rvv_i64: 41.0
vp9_dc_16x16_8bpp_c: 109.2
vp9_dc_16x16_8bpp_rvv_i32: 72.7
vp9_dc_32x32_8bpp_c: 365.2
vp9_dc_32x32_8bpp_rvv_i32: 165.5
vp9_dc_127_8x8_8bpp_c: 23.0
vp9_dc_127_8x8_8bpp_rvv_i64: 22.0
vp9_dc_127_16x16_8bpp_c: 70.2
vp9_dc_127_16x16_8bpp_rvv_i32: 50.2
vp9_dc_127_32x32_8bpp_c: 295.2
vp9_dc_127_32x32_8bpp_rvv_i32: 136.7
vp9_dc_128_8x8_8bpp_c: 23.0
vp9_dc_128_8x8_8bpp_rvv_i64: 22.0
vp9_dc_128_16x16_8bpp_c: 70.2
vp9_dc_128_16x16_8bpp_rvv_i32: 50.2
vp9_dc_128_32x32_8bpp_c: 295.2
vp9_dc_128_32x32_8bpp_rvv_i32: 136.7
vp9_dc_129_8x8_8bpp_c: 23.0
vp9_dc_129_8x8_8bpp_rvv_i64: 22.0
vp9_dc_129_16x16_8bpp_c: 70.2
vp9_dc_129_16x16_8bpp_rvv_i32: 50.2
vp9_dc_129_32x32_8bpp_c: 295.2
vp9_dc_129_32x32_8bpp_rvv_i32: 136.7
vp9_dc_left_8x8_8bpp_c: 38.0
vp9_dc_left_8x8_8bpp_rvv_i64: 36.0
vp9_dc_left_16x16_8bpp_c: 93.2
vp9_dc_left_16x16_8bpp_rvv_i32: 67.7
vp9_dc_left_32x32_8bpp_c: 333.2
vp9_dc_left_32x32_8bpp_rvv_i32: 158.5
vp9_dc_top_8x8_8bpp_c: 38.7
vp9_dc_top_8x8_8bpp_rvv_i64: 36.0
vp9_dc_top_16x16_8bpp_c: 93.2
vp9_dc_top_16x16_8bpp_rvv_i32: 67.7
vp9_dc_top_32x32_8bpp_c: 333.2
vp9_dc_top_32x32_8bpp_rvv_i32: 156.2
---
 libavcodec/riscv/Makefile|   2 +
 libavcodec/riscv/vp9_intra_rvv.S | 115 +
 libavcodec/riscv/vp9dsp.h| 171 +++
 libavcodec/riscv/vp9dsp_init.c   |  61 +++
 libavcodec/vp9dsp.c  |   2 +
 libavcodec/vp9dsp.h  |   1 +
 6 files changed, 352 insertions(+)
 create mode 100644 libavcodec/riscv/vp9_intra_rvv.S
 create mode 100644 libavcodec/riscv/vp9dsp.h
 create mode 100644 libavcodec/riscv/vp9dsp_init.c

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index 6c2ce3001a..69ccd0896d 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -58,5 +58,7 @@ OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
 RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
 OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
 RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
+OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
+RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
 OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
 RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
new file mode 100644
index 00..db9774c263
--- /dev/null
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2024 Institute of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/riscv/asm.S"
+
+.macro avgdc size // t1 = (v16[0] + sum of v8 + 2^(size-1)) >> size, i.e. a rounded average
+vwredsumu.vs v16, v8, v16 // widening unsigned sum reduction: v16[0] = v16[0] + sum of the active elements of v8
+vsetivli zero, 1, e16, m1, ta, ma // from here on only the single 16-bit sum in element 0 is used
+vmv.x.s  t1, v16 // copy element 0 of v16 into scalar register t1
+addi t1, t1, 1 << (\size - 1) // add half of the divisor so the shift below rounds instead of truncating
+srai t1, t1, \size // divide by 2^size
+.endm
+
+.macro getdc ty

Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels

2024-04-06 Thread flow gg
ping

flow gg  于2024年3月8日周五 17:46写道:

> Alright, using m8, but for now don't add code to address dependencies in
> loops that have a minor impact. Updated in the reply
>
> Rémi Denis-Courmont  于2024年3月8日周五 17:08写道:
>
>>
>>
>> Le 8 mars 2024 02:45:46 GMT+02:00, flow gg  a
>> écrit :
>> >> Isn't it also faster to max LMUL for the adds here?
>> >
>> >It requires the use of one more vset, making the time slightly longer:
>> >147.7 (m1), 148.7 (m8 + vset).
>>
>> A variation of 0.6% on a single set of kernels will end up below
>> measurement noise in real overall codec usage. And then reducing the
>> I-cache contention can improve performance in other ways. Larger LMUL
>> should also improve performance on bigger cores with more ALUs. So it's not
>> all black and white.
>>
>> My personal preference is to keep the code small if it makes almost no
>> difference but I'm not BDFL.
>>
>> >Also this might not be much noticeable on C908, but avoiding sequential
>> >dependencies on the address registers may help. I mean, avoid using as
>> >address
>> >operand a value that was calculated by the immediate previous
>> instruction.
>> >
>> >> Okay, but the test results haven't changed..
>> >It would add more than ten lines of code; perhaps shorter code would
>> be better?
>>
>> I don't know. There are definitely in-order vector cores coming, and data
>> dependencies will hurt them. But I don't know if anyone will care about
>> FFmpeg on those.
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V loop_filter_simple

2024-04-20 Thread flow gg

From 2f516e0236bd84d78ce6fd7e55c4b1a3c9d99baa Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 20 Apr 2024 23:32:10 +0800
Subject: [PATCH 1/3] lavc/vp8dsp: R-V V loop_filter_simple

C908:
vp8_loop_filter_simple_h_c: 416.0
vp8_loop_filter_simple_h_rvv_i32: 187.5
vp8_loop_filter_simple_v_c: 429.7
vp8_loop_filter_simple_v_rvv_i32: 104.0
---
 libavcodec/riscv/vp8dsp_init.c |   5 ++
 libavcodec/riscv/vp8dsp_rvv.S  | 105 +
 2 files changed, 110 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 2dd583d079..46ca71ed04 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -38,6 +38,8 @@ VP8_BILIN(16, rvv);
 VP8_BILIN(8,  rvv);
 VP8_BILIN(4,  rvv);
 
+VP8_LF(rvv);
+
 av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
 {
 #if HAVE_RVV
@@ -120,6 +122,9 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 if (flags & AV_CPU_FLAG_RVB_ADDR) {
 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
 }
+
+c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
+c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index ba644f0f47..2eadfc5766 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -72,6 +72,111 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
 ret
 endfunc
 
+.macro filter_fmin len a f1 p0f2 q0f1 // from 8-bit a: p0f2 = max(v8 + ((min(a, a7) + 3) >> 3), 0), f1 = (min(a, t3) + 4) >> 3, q0f1 = max(v16 - f1, 0)
+.ifc \len,16
+vsetvli zero, zero, e16, m2, ta, ma
+.else
+vsetvli zero, zero, e16, m1, ta, ma
+.endif
+vsext.vf2   \q0f1, \a // sign-extend the 8-bit filter value to 16 bits
+vmin.vx \p0f2, \q0f1, a7 // cap at a7 (the filter macro in this patch sets a7 = 124)
+vmin.vx \q0f1, \q0f1, t3 // cap at t3 (the filter macro in this patch sets t3 = 123)
+vadd.vi \p0f2, \p0f2, 3
+vadd.vi \q0f1, \q0f1, 4
+vsra.vi \p0f2, \p0f2, 3 // (min(a, a7) + 3) >> 3
+vsra.vi \f1,   \q0f1, 3 // f1 = (min(a, t3) + 4) >> 3, kept in a separate register for the caller
+vadd.vv \p0f2, \p0f2, v8 // add v8 (the filter macro in this patch holds the widened p0 there)
+vsub.vv \q0f1, v16, \f1 // v16 - f1 (the filter macro in this patch holds the widened q0 in v16)
+vmax.vx \p0f2, \p0f2, zero // clamp negative results to 0; the caller saturates the upper side when narrowing
+vmax.vx \q0f1, \q0f1, zero
+.endm
+
+.macro filter len type normal inner dst stride fE fI thresh // VP8 simple loop filter over len pixels of the edge at dst (type v: rows above/below, otherwise columns left/right)
+.ifc \type,v
+slli a6, \stride, 1 // a6 = 2 * stride
+sub t2, \dst, a6 // t2 = dst - 2 * stride (p1 row)
+add t4, \dst, \stride // t4 = dst + stride (q1 row)
+sub t1, \dst, \stride // t1 = dst - stride (p0 row)
+vle8.v  v1, (t2) // v1 = p1
+vle8.v  v11, (t4) // v11 = q1
+vle8.v  v17, (t1) // v17 = p0
+vle8.v  v22, (\dst) // v22 = q0
+.else
+addi t1, \dst, -1 // t1 = dst - 1 (p0 column)
+addi a6, \dst, -2 // a6 = dst - 2 (p1 column)
+addi t4, \dst, 1 // t4 = dst + 1 (q1 column)
+vlse8.v v1, (a6), \stride // strided loads pick one byte per row: v1 = p1
+vlse8.v v11, (t4), \stride // v11 = q1
+vlse8.v v17, (t1), \stride // v17 = p0
+vlse8.v v22, (\dst), \stride // v22 = q0
+.endif
+vwsubu.vv   v12, v1, v11 // p1-q1
+vwsubu.vv   v24, v22, v17 // q0-p0
+vnclip.wi   v23, v12, 0 // v23 = p1-q1 saturated to signed 8 bits
+
+.ifc \len,16
+vsetvli zero, zero, e16, m2, ta, ma
+.else
+vsetvli zero, zero, e16, m1, ta, ma
+.endif
+
+// vp8_simple_limit(dst + i, stride, flim)
+li  a7, 2
+vneg.v  v18, v12
+vmax.vv v18, v18, v12 // v18 = |p1-q1|
+vneg.v  v8, v24
+vmax.vv v8, v8, v24 // v8 = |q0-p0|
+vsrl.vi v18, v18, 1 // v18 = |p1-q1| >> 1
+vmacc.vx v18, a7, v8 // v18 += 2 * |q0-p0|
+vmsleu.vx   v0, v18, \fE // v0 = lanes whose sum is <= fE; only these lanes are stored at the end
+
+li  t5, 3
+li  a7, 124 // cap consumed by filter_fmin for the p0 adjustment
+li  t3, 123 // cap consumed by filter_fmin for the q0 adjustment
+vsext.vf2   v4, v23 // v4 = saturated p1-q1 widened back to 16 bits
+vzext.vf2   v8, v17  // p0
+vzext.vf2   v16, v22 // q0
+vmul.vx v30, v24, t5 // v30 = 3 * (q0-p0)
+vadd.vv v12, v30, v4 // v12 = 3 * (q0-p0) + saturated (p1-q1)
+
+.ifc \len,16
+vsetvli zero, zero, e8, m1, ta, ma
+.else
+vsetvli zero, zero, e8, mf2, ta, ma
+.endif
+vnclip.wi   v11, v12, 0 // v11 = that sum saturated to signed 8 bits
+filter_fmin \len v11 v24 v4 v6
+
+.ifc \len,16
+vsetvli zero, zero, e8, m1, ta, ma
+.else
+vsetvli zero, zero, e8, mf2, ta, ma
+.endif
+vnclipu.wi  v4, v4, 0 // narrow the adjusted p0 to unsigned 8 bits with saturation
+vnclipu.wi  v6, v6, 0 // narrow the adjusted q0 likewise
+
+.ifc \type,v
+vse8.v  v4, (t1), v0.t // masked store of the new p0
+vse8.v  v6, (\dst), v0.t // masked store of the new q0
+.else
+vsse8.v v4, (t1), \stride, v0.t // masked strided store of the new p0 column
+vsse8.v v6, (\dst), \stride, v0.t // masked strided store of the new q0 column
+.endif
+
+.endm // normal, inner, fI and thresh are accepted but not referenced by this version of the macro
+
+func ff_vp8_v_loop_filter16_simple_rvv, zve32x // simple loop filter, type v, 16 pixels; macro arguments: dst = a0, stride = a1, fE = a2
+vsetivli zero, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+filter 16 v 0 0 a0 a1 a2 a3 a4
+ret
+endfunc
+
+func ff_vp8_h_loop_filter16_simple_rvv, zve32x // simple loop filter, type h, 16 pixels; macro arguments: dst = a0, stride = a1, fE = a2
+vsetivli zero, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+filter 16 h 0 0 a0 a1 a2 a3 a4
+ret
+endfunc
+
 .macro put_vp8_pixels
 1:
 addi  a4, a4, -1
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] lavc/vp8dsp: R-V V loop_filter_inner

2024-04-20 Thread flow gg

From c033ab8d30135dc02b09b1747c0761baefdcbb4a Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 20 Apr 2024 23:13:07 +0800
Subject: [PATCH 2/3] lavc/vp8dsp: R-V V loop_filter_inner

C908:
vp8_loop_filter8uv_inner_v_c: 738.2
vp8_loop_filter8uv_inner_v_rvv_i32: 455.2
vp8_loop_filter16y_inner_h_c: 685.0
vp8_loop_filter16y_inner_h_rvv_i32: 497.0
vp8_loop_filter16y_inner_v_c: 743.7
vp8_loop_filter16y_inner_v_rvv_i32: 295.7
---
 libavcodec/riscv/vp8dsp_init.c |   4 ++
 libavcodec/riscv/vp8dsp_rvv.S  | 110 +
 2 files changed, 114 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index 46ca71ed04..aa95021df5 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -123,6 +123,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
 }
 
+c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
+c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
+c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
+
 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_rvv;
 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_rvv;
 }
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index 2eadfc5766..f10e269d9d 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -72,6 +72,13 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
 ret
 endfunc
 
+.macro filter_abs dst diff fI // dst = |diff|, then v27 &= (|diff| <= fI); v8 is used as scratch
+vneg.v  v8, \diff
+vmax.vv \dst, v8, \diff // max(-diff, diff) = |diff|
+vmsleu.vx   v8, \dst, \fI // v8 = mask of lanes with |diff| <= fI
+vmand.mm v27, v27, v8 // accumulate the condition into the running mask v27
+.endm
+
 .macro filter_fmin len a f1 p0f2 q0f1
 .ifc \len,16
 vsetvli zero, zero, e16, m2, ta, ma
@@ -101,6 +108,16 @@ endfunc
 vle8.v  v11, (t4)
 vle8.v  v17, (t1)
 vle8.v  v22, (\dst)
+.if \normal // extra rows for the non-simple filters; relies on a6 = 2 * stride, t2 = dst - 2 * stride, t1 = dst - stride, t4 = dst + stride set earlier in this branch
+sub t3, t2, a6 // t3 = dst - 4 * stride
+sub t0, t1, a6 // t0 = dst - 3 * stride
+add t6, \dst, a6 // t6 = dst + 2 * stride
+add a7, t4, a6 // a7 = dst + 3 * stride
+vle8.v  v2, (t3) // v2 = row 4 above the edge (p3)
+vle8.v  v15, (t0) // v15 = row 3 above the edge (p2)
+vle8.v  v10, (t6) // v10 = row 2 below dst (q2)
+vle8.v  v14, (a7) // v14 = row 3 below dst (q3)
+.endif
 .else
 addit1, \dst, -1
 addia6, \dst, -2
@@ -109,9 +126,28 @@ endfunc
 vlse8.v v11, (t4), \stride
 vlse8.v v17, (t1), \stride
 vlse8.v v22, (\dst), \stride
+.if \normal // extra columns for the non-simple filters
+addi t5, \dst, -4 // t5 = dst - 4
+addi t0, \dst, -3 // t0 = dst - 3
+addi t6, \dst, 2 // t6 = dst + 2
+addi a7, \dst, 3 // a7 = dst + 3
+vlse8.v v2, (t5), \stride // strided loads pick one byte per row: v2 = column dst - 4 (p3)
+vlse8.v v15, (t0), \stride // v15 = column dst - 3 (p2)
+vlse8.v v10, (t6), \stride // v10 = column dst + 2 (q2)
+vlse8.v v14, (a7), \stride // v14 = column dst + 3 (q3)
+.endif
 .endif
 vwsubu.vv   v12, v1, v11 // p1-q1
 vwsubu.vv   v24, v22, v17// q0-p0
+
+.if \normal // widened neighbour differences, given v1 = p1, v17 = p0, v22 = q0, v11 = q1, v2 = p3, v15 = p2, v10 = q2, v14 = q3
+vwsubu.vv   v30, v1, v17 // v30 = p1 - p0
+vwsubu.vv   v20, v11, v22 // v20 = q1 - q0
+vwsubu.vv   v28, v1, v15 // v28 = p1 - p2
+vwsubu.vv   v4, v2, v15 // v4 = p3 - p2
+vwsubu.vv   v6, v10, v11 // v6 = q2 - q1
+vwsubu.vv   v2, v14, v10 // v2 = q3 - q2 (replaces p3, which was consumed two lines above)
+.endif
 vnclip.wi   v23, v12, 0
 
 .ifc \len,16
@@ -130,6 +166,26 @@ endfunc
 vmacc.vxv18, a7, v8
 vmsleu.vx   v0, v18, \fE
 
+.if \normal // build the normal-limit mask in v27 and the high-edge-variance (hev) mask in v3
+vneg.v  v18, v30
+vmax.vv v30, v18, v30 // v30 = |v30|
+vmsleu.vx   v27, v30, \fI // start the mask: |v30| <= fI
+filter_abs  v18 v28 \fI
+filter_abs  v18 v4 \fI
+filter_abs  v18 v6 \fI
+filter_abs  v18 v2 \fI
+filter_abs  v20 v20 \fI
+vmand.mm v27, v0, v27 // vp8_simple_limit && normal
+
+vmsgtu.vx   v20, v20, \thresh // hev
+vmsgtu.vx   v3, v30, \thresh
+vmor.mm v3, v3, v20  // v3 = hev: > thresh
+vzext.vf2   v18, v1  // v18 = p1
+vmand.mm v0, v27, v3  // v0 = normal && hev
+vzext.vf2   v20, v11 // v20 = q1
+vmnot.m v3, v3   // v3 = !hev
+.endif
+
 li  t5, 3
 li  a7, 124
 li  t3, 123
@@ -163,8 +219,62 @@ endfunc
 vsse8.v v6, (\dst), \stride, v0.t
 .endif
 
+.if \normal
+vmand.mmv0, v27, v3  // vp8_normal_limit & !hv
+
+.if \inner
+vnclip.wi   v30, v30, 0
+filter_fmin \len v30 v24 v4 v6
+vadd.vi v24, v24, 1
+vsra.vi v24, v24, 1  // (f1 + 1) >> 1;
+vadd.vv v8, v18, v24
+vsub.vv v10, v20, v24
+.endif
+
+vmax.vx v8, v8, zero
+vmax.vx v10, v10, zero
+.ifc \len,16
+vsetvli zero, zer

[FFmpeg-devel] [PATCH 3/3] lavc/vp8dsp: R-V V loop_filter

2024-04-20 Thread flow gg

From cff79c9500b94f4c0abdd9cd68c91cc736366c78 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sat, 20 Apr 2024 23:26:58 +0800
Subject: [PATCH 3/3] lavc/vp8dsp: R-V V loop_filter

C908:
vp8_loop_filter8uv_v_c: 745.5
vp8_loop_filter8uv_v_rvv_i32: 467.2
vp8_loop_filter16y_h_c: 674.2
vp8_loop_filter16y_h_rvv_i32: 553.0
vp8_loop_filter16y_v_c: 732.7
vp8_loop_filter16y_v_rvv_i32: 324.5
---
 libavcodec/riscv/vp8dsp_init.c |  4 +++
 libavcodec/riscv/vp8dsp_rvv.S  | 63 ++
 2 files changed, 67 insertions(+)

diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c
index aa95021df5..597e6acec8 100644
--- a/libavcodec/riscv/vp8dsp_init.c
+++ b/libavcodec/riscv/vp8dsp_init.c
@@ -123,6 +123,10 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c)
 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv;
 }
 
+c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_rvv;
+c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_rvv;
+c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_rvv;
+
 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_rvv;
 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_rvv;
 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_rvv;
diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
index f10e269d9d..af28ea5258 100644
--- a/libavcodec/riscv/vp8dsp_rvv.S
+++ b/libavcodec/riscv/vp8dsp_rvv.S
@@ -229,6 +229,39 @@ endfunc
 vsra.vi v24, v24, 1  // (f1 + 1) >> 1;
 vadd.vv v8, v18, v24
 vsub.vv v10, v20, v24
+.else
+li  t5, 27
+li  t3, 9
+li  a7, 18
+vwmul.vx v2, v11, t5 // v2 = 27 * v11, widened to 16 bits (v11 presumably holds the clamped filter value w; TODO confirm against the full macro)
+vwmul.vx v6, v11, t3 // v6 = 9 * v11
+vwmul.vx v4, v11, a7 // v4 = 18 * v11
+
+.ifc \len,16
+vsetvli zero, zero, e16, m2, ta, ma
+.else
+vsetvli zero, zero, e16, m1, ta, ma
+.endif
+
+li  a7, 63 // rounding bias added before the >> 7 below
+vzext.vf2   v14, v15 // p2
+vzext.vf2   v24, v10 // q2
+vadd.vx v2, v2, a7
+vadd.vx v4, v4, a7
+vadd.vx v6, v6, a7
+vsra.vi v2, v2, 7 // a0
+vsra.vi v12, v4, 7   // a1
+vsra.vi v6, v6, 7 // a2
+vadd.vv v14, v14, v6 // p2 + a2
+vsub.vv v22, v24, v6 // q2 - a2
+vsub.vv v10, v20, v12 // q1 - a1
+vadd.vv v4, v8, v2   // p0 + a0
+vsub.vv v6, v16, v2  // q0 - a0
+vadd.vv v8, v12, v18 // a1 + p1
+vmax.vx v4, v4, zero // clamp negative results to 0
+vmax.vx v6, v6, zero
+vmax.vx v14, v14, zero
+vmax.vx v16, v22, zero // v16 = max(q2 - a2, 0)
 .endif
 
 vmax.vx v8, v8, zero
@@ -253,6 +286,17 @@ endfunc
 vsse8.v v6, (a6), \stride, v0.t
 vsse8.v v7, (t4), \stride, v0.t
 .endif
+.if !\inner // the non-inner filter also writes back v14 and v16 (p2 + a2 and q2 - a2 in this patch)
+vnclipu.wi  v14, v14, 0 // narrow to unsigned 8 bits with saturation
+vnclipu.wi  v16, v16, 0
+.ifc \type,v
+vse8.v  v14, (t0), v0.t // masked store to t0, the address v15 (p2) was loaded from
+vse8.v  v16, (t6), v0.t // masked store to t6, the address v10 (q2) was loaded from
+.else
+vsse8.v v14, (t0), \stride, v0.t // strided variants of the same two stores
+vsse8.v v16, (t6), \stride, v0.t
+.endif
+.endif
 .endif
 .endm
 
@@ -275,6 +319,25 @@ func ff_vp8_v_loop_filter8uv_inner_rvv, zve32x
 ret
 endfunc
 
+func ff_vp8_v_loop_filter16_rvv, zve32x // filter macro with normal = 1, inner = 0, type v, 16 pixels; dst = a0, stride = a1, fE = a2, fI = a3, thresh = a4
+vsetivli zero, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+filter 16 v 1 0 a0 a1 a2 a3 a4
+ret
+endfunc
+
+func ff_vp8_h_loop_filter16_rvv, zve32x // filter macro with normal = 1, inner = 0, type h, 16 pixels; dst = a0, stride = a1, fE = a2, fI = a3, thresh = a4
+vsetivli zero, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+filter 16 h 1 0 a0 a1 a2 a3 a4
+ret
+endfunc
+
+func ff_vp8_v_loop_filter8uv_rvv, zve32x // runs the 8-pixel filter twice, first with dst = a0 and then with dst = a1; stride = a2, fE = a3, fI = a4, thresh = a5
+vsetivli zero, 8, e8, mf2, ta, ma // vl = 8 elements of 8 bits
+filter 8 v 1 0 a0 a2 a3 a4 a5
+filter 8 v 1 0 a1 a2 a3 a4 a5
+ret
+endfunc
+
 func ff_vp8_v_loop_filter16_simple_rvv, zve32x
 vsetivlizero, 16, e8, m1, ta, ma
 filter 16 v 0 0 a0 a1 a2 a3 a4
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/3] lavc/vp8dsp: R-V V loop_filter_simple

2024-04-20 Thread flow gg
github link: https://github.com/hleft/FFmpeg/tree/vp8vp9

flow gg  于2024年4月20日周六 23:55写道:

>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/2] lavc/vc1dsp: R-V V mspel_pixels

2024-04-29 Thread flow gg
Happy to see you back :)

Rémi Denis-Courmont  于2024年4月29日周一 02:06写道:

> Le sunnuntaina 7. huhtikuuta 2024, 8.38.54 EEST flow gg a écrit :
> > ping
>
> I have been away for a while, and catching up takes time, sorry.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/2] checkasm/blockdsp: add fill_block test

2024-04-29 Thread flow gg

From 0c196a37cb4036d8c618c06c02a011b910cc56ce Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Mon, 29 Apr 2024 14:18:23 +0800
Subject: [PATCH 1/2] checkasm/blockdsp: add fill_block test

---
 tests/checkasm/blockdsp.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index 22a2f79455..355e111d43 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -29,6 +29,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 
+typedef struct {
+const char *name;
+int size;
+} test;
+
 #define randomize_buffers(size) \
 do {\
 int i;  \
@@ -52,6 +57,31 @@ do {\
 }   \
 } while (0)
 
+static void check_fill(BlockDSPContext *h){ /* checkasm test: compare every fill_block_tab[] entry against the reference and benchmark it */
+const test tests[] = { /* one entry per fill_block_tab[] slot: reported name and block width in bytes */
+{"fill_block_tab[0]", 16},
+{"fill_block_tab[1]", 8},
+};
+const int n = 16; /* number of rows filled in every call */
+
+LOCAL_ALIGNED_32(uint8_t, buf0, [16 * 32]); /* output of the reference implementation */
+LOCAL_ALIGNED_32(uint8_t, buf1, [16 * 32]); /* output of the implementation under test */
+
+for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+declare_func(void, uint8_t *block, uint8_t value,
+ ptrdiff_t line_size, int h);
+if (check_func(h->fill_block_tab[t], "blockdsp.%s", tests[t].name)) {
+uint8_t value = rnd(); /* random fill byte */
+randomize_buffers(tests[t].size);
+call_ref(buf0, value, 16, n); /* line_size is fixed at 16 for both widths */
+call_new(buf1, value, 16, n);
+if (memcmp(buf0, buf1, sizeof(*buf0) * tests[t].size * n)) /* NOTE(review): for width 8 this compares 128 bytes, i.e. the first 8 rows of the 16-byte pitch, including bytes the fill presumably does not write - confirm this is intended */
+fail();
+bench_new(buf0, value, 16, n);
+}
+}
+}
+
 void checkasm_check_blockdsp(void)
 {
 LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
@@ -64,5 +94,7 @@ void checkasm_check_blockdsp(void)
 check_clear(clear_block,  8 * 8);
 check_clear(clear_blocks, 8 * 8 * 6);
 
+check_fill(&h);
+
 report("blockdsp");
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg

From 4315f4e4774e3006d7cc55b6d235cb80e0173cf9 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 6 Mar 2024 12:46:03 +0800
Subject: [PATCH 2/2] lavc/blockdsp: R-V V fill_block

C908:
blockdsp.fill_block_tab[0]_c: 550.0
blockdsp.fill_block_tab[0]_rvv_i64: 48.2
blockdsp.fill_block_tab[1]_c: 148.7
blockdsp.fill_block_tab[1]_rvv_i64: 29.7
---
 libavcodec/riscv/blockdsp_init.c |  6 ++
 libavcodec/riscv/blockdsp_rvv.S  | 21 +
 2 files changed, 27 insertions(+)

diff --git a/libavcodec/riscv/blockdsp_init.c b/libavcodec/riscv/blockdsp_init.c
index 59b2f9d47b..42c8e87fa7 100644
--- a/libavcodec/riscv/blockdsp_init.c
+++ b/libavcodec/riscv/blockdsp_init.c
@@ -27,6 +27,10 @@
 
 void ff_clear_block_rvv(int16_t *block);
 void ff_clear_blocks_rvv(int16_t *block);
+void ff_fill_block16_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
+void ff_fill_block8_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
 
 av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 {
@@ -36,6 +40,8 @@ av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 if (flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb() >= 16) {
 c->clear_block = ff_clear_block_rvv;
 c->clear_blocks = ff_clear_blocks_rvv;
+c->fill_block_tab[0] = ff_fill_block16_rvv;
+c->fill_block_tab[1] = ff_fill_block8_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/blockdsp_rvv.S b/libavcodec/riscv/blockdsp_rvv.S
index 8bb00bb467..71d72cce56 100644
--- a/libavcodec/riscv/blockdsp_rvv.S
+++ b/libavcodec/riscv/blockdsp_rvv.S
@@ -40,3 +40,24 @@ func ff_clear_blocks_rvv, zve64x
 
 ret
 endfunc
+
+func ff_fill_block16_rvv, zve32x // fill a3 rows of 16 bytes with the byte in a1; a0 = first row, a2 = distance between rows in bytes
+vsetivli  t0, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+vmv.v.x   v8, a1 // v8 = 16 copies of the fill byte
+1: // one iteration per row
+addi  a3, a3, -1 // one row fewer to go
+vse8.v    v8, (a0) // store the 16-byte row
+add   a0, a0, a2 // advance to the next row
+bnez  a3, 1b // loop until the row count reaches 0; the body runs once before the first test
+
+ret
+endfunc
+
+func ff_fill_block8_rvv, zve64x // fill a3 rows of 8 bytes with the byte in a1 using one strided store; a0 = first row, a2 = distance between rows in bytes
+vsetvli   t0, zero, e8, m8, ta, ma // vl = VLMAX for 8-bit elements over a group of 8 registers
+vmv.v.x   v8, a1 // every byte of v8..v15 = fill byte
+vsetvli   t0, a3, e64, m8, ta, ma // vl = min(a3, VLMAX) elements of 64 bits: one element per row
+vsse64.v  v8, (a0), a2 // store vl 8-byte elements a2 bytes apart; no loop, so this assumes a3 <= VLMAX (VLMAX = VLENB here; the init code in this patch requires VLENB >= 16)
+
+ret
+endfunc
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V ipred vert

2024-04-29 Thread flow gg
updated it in the reply and https://github.com/hleft/FFmpeg/tree/vp8vp9

Rémi Denis-Courmont  于2024年4月30日周二 01:57写道:

> Le perjantaina 22. maaliskuuta 2024, 8.02.38 EEST flow gg a écrit :
> > Because the previous patch was updated, this patch was updated as well in this
> response
>
> Seemingly needs rebase since April 7.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
From f9a3d9d10536520c8a0b34de46fd5804796207ac Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Sun, 7 Apr 2024 13:21:02 +0800
Subject: [PATCH 08/20] lavc/vp9dsp: R-V V ipred vert

C908:
vp9_vert_8x8_8bpp_c: 22.0
vp9_vert_8x8_8bpp_rvv_i64: 18.5
vp9_vert_16x16_8bpp_c: 71.2
vp9_vert_16x16_8bpp_rvv_i32: 50.7
vp9_vert_32x32_8bpp_c: 300.2
vp9_vert_32x32_8bpp_rvv_i32: 136.7
---
 libavcodec/riscv/vp9_intra_rvv.S | 35 
 libavcodec/riscv/vp9dsp.h|  6 ++
 libavcodec/riscv/vp9dsp_init.c   |  3 +++
 3 files changed, 44 insertions(+)

diff --git a/libavcodec/riscv/vp9_intra_rvv.S b/libavcodec/riscv/vp9_intra_rvv.S
index db9774c263..b5f0f9d3c3 100644
--- a/libavcodec/riscv/vp9_intra_rvv.S
+++ b/libavcodec/riscv/vp9_intra_rvv.S
@@ -113,3 +113,38 @@ func_dc dc_left  8   left 3  0  zve64x
 func_dc dc_top   32  top  5  1  zve32x
 func_dc dc_top   16  top  4  1  zve32x
 func_dc dc_top   8   top  3  0  zve64x
+
+func ff_v_32x32_rvv, zve32x // copy the 32 bytes at a3 into 32 rows starting at a0, rows a1 bytes apart
+vsetivli zero, 8, e8, mf2, ta, ma // vl = 8; the 32-bit accesses below therefore move 8 * 4 = 32 bytes each
+vle32.v  v8, (a3) // load the 32-byte source row once
+
+.rept 31 // rows 0..30: store, then step to the next row
+vse32.v  v8, (a0)
+add  a0, a0, a1
+.endr
+vse32.v  v8, (a0) // row 31: no pointer update needed afterwards
+
+ret
+endfunc
+
+func ff_v_16x16_rvv, zve32x // copy the 16 bytes at a3 into 16 rows starting at a0, rows a1 bytes apart
+vsetivli zero, 4, e8, mf4, ta, ma // vl = 4; the 32-bit accesses below therefore move 4 * 4 = 16 bytes each
+vle32.v  v8, (a3) // load the 16-byte source row once
+
+.rept 15 // rows 0..14: store, then step to the next row
+vse32.v  v8, (a0)
+add  a0, a0, a1
+.endr
+vse32.v  v8, (a0) // row 15: no pointer update needed afterwards
+
+ret
+endfunc
+
+func ff_v_8x8_rvv, zve64x // copy the 8 bytes at a3 into 8 rows starting at a0, rows a1 bytes apart
+ld   t0, (a3) // t0 = the whole 8-byte source row
+vsetivli zero, 8, e64, m4, ta, ma // request vl = 8 elements of 64 bits: one element per row
+vmv.v.x  v8, t0 // replicate the row into every element
+vsse64.v v8, (a0), a1 // strided store: element i goes to a0 + i * a1
+
+ret
+endfunc
diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
index 25047ed507..113397ce86 100644
--- a/libavcodec/riscv/vp9dsp.h
+++ b/libavcodec/riscv/vp9dsp.h
@@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
  const uint8_t *a);
 void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
const uint8_t *a);
+void ff_v_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+const uint8_t *a);
+void ff_v_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+const uint8_t *a);
+void ff_v_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
+  const uint8_t *a);
 
 #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx) \
 void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t dststride,   \
diff --git a/libavcodec/riscv/vp9dsp_init.c b/libavcodec/riscv/vp9dsp_init.c
index 69ab39004c..9c550d40b5 100644
--- a/libavcodec/riscv/vp9dsp_init.c
+++ b/libavcodec/riscv/vp9dsp_init.c
@@ -36,6 +36,7 @@ static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
 dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
 dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
 dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
+dsp->intra_pred[TX_8X8][VERT_PRED] = ff_v_8x8_rvv;
 }
 
 if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
@@ -51,6 +52,8 @@ static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
 dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
 dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
 dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
+dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvv;
+dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvv;
 }
 #endif
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg
Hi, I initially used a loop, but according to libavcodec/blockdsp.h,

the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
does not require a loop.

```
/* add and put pixel (decoding)
 * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
 * h for op_pixels_func is limited to { width / 2, width },
 * but never larger than 16 and never smaller than 4. */
typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
 uint8_t value, ptrdiff_t line_size, int h);
```

Rémi Denis-Courmont  于2024年4月30日周二 01:31写道:

> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
> >
>
> Are you sure that this works with all vector lengths?
> The block8 code looks odd.
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-29 Thread flow gg
Since there is no 8x16, I changed m8 to m4, and updated it in the reply



flow gg  于2024年4月30日周二 08:26写道:

> Hi, I initially used a loop, but according to libavcodec/blockdsp.h,
>
> the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
> does not require a loop.
>
> ```
> /* add and put pixel (decoding)
>  * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
>  * h for op_pixels_func is limited to { width / 2, width },
>  * but never larger than 16 and never smaller than 4. */
> typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
>  uint8_t value, ptrdiff_t line_size, int h);
> ```
>
> Rémi Denis-Courmont  于2024年4月30日周二 01:31写道:
>
>> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
>> >
>>
>> Are you sure that this works with all vector lengths?
>> The block8 code looks odd.
>>
>> --
>> レミ・デニ-クールモン
>> http://www.remlab.net/
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
From 38068cd4c770b24ac494bddab6c3d19149d2f5cb Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Wed, 6 Mar 2024 12:46:03 +0800
Subject: [PATCH 2/2] lavc/blockdsp: R-V V fill_block

C908:
blockdsp.fill_block_tab[0]_c: 549.7
blockdsp.fill_block_tab[0]_rvv_i64: 48.2
blockdsp.fill_block_tab[1]_c: 77.0
blockdsp.fill_block_tab[1]_rvv_i64: 19.7
---
 libavcodec/riscv/blockdsp_init.c |  6 ++
 libavcodec/riscv/blockdsp_rvv.S  | 21 +
 2 files changed, 27 insertions(+)

diff --git a/libavcodec/riscv/blockdsp_init.c b/libavcodec/riscv/blockdsp_init.c
index 59b2f9d47b..42c8e87fa7 100644
--- a/libavcodec/riscv/blockdsp_init.c
+++ b/libavcodec/riscv/blockdsp_init.c
@@ -27,6 +27,10 @@
 
 void ff_clear_block_rvv(int16_t *block);
 void ff_clear_blocks_rvv(int16_t *block);
+void ff_fill_block16_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
+void ff_fill_block8_rvv(uint8_t *block, uint8_t value, ptrdiff_t line_size,
+   int h);
 
 av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 {
@@ -36,6 +40,8 @@ av_cold void ff_blockdsp_init_riscv(BlockDSPContext *c)
 if (flags & AV_CPU_FLAG_RVV_I64 && ff_get_rv_vlenb() >= 16) {
 c->clear_block = ff_clear_block_rvv;
 c->clear_blocks = ff_clear_blocks_rvv;
+c->fill_block_tab[0] = ff_fill_block16_rvv;
+c->fill_block_tab[1] = ff_fill_block8_rvv;
 }
 #endif
 }
diff --git a/libavcodec/riscv/blockdsp_rvv.S b/libavcodec/riscv/blockdsp_rvv.S
index 8bb00bb467..18ab17da00 100644
--- a/libavcodec/riscv/blockdsp_rvv.S
+++ b/libavcodec/riscv/blockdsp_rvv.S
@@ -40,3 +40,24 @@ func ff_clear_blocks_rvv, zve64x
 
 ret
 endfunc
+
+func ff_fill_block16_rvv, zve32x // fill a3 rows of 16 bytes with the byte in a1; a0 = first row, a2 = distance between rows in bytes
+vsetivli  t0, 16, e8, m1, ta, ma // vl = 16 elements of 8 bits
+vmv.v.x   v8, a1 // v8 = 16 copies of the fill byte
+1: // one iteration per row
+addi  a3, a3, -1 // one row fewer to go
+vse8.v    v8, (a0) // store the 16-byte row
+add   a0, a0, a2 // advance to the next row
+bnez  a3, 1b // loop until the row count reaches 0; the body runs once before the first test
+
+ret
+endfunc
+
+func ff_fill_block8_rvv, zve64x // fill a3 rows of 8 bytes with the byte in a1 using one strided store; a0 = first row, a2 = distance between rows in bytes
+vsetvli   t0, zero, e8, m4, ta, ma // vl = VLMAX for 8-bit elements over a group of 4 registers
+vmv.v.x   v8, a1 // every byte of v8..v11 = fill byte
+vsetvli   t0, a3, e64, m4, ta, ma // vl = min(a3, VLMAX) elements of 64 bits: one element per row
+vsse64.v  v8, (a0), a2 // store vl 8-byte elements a2 bytes apart; no loop, so this assumes a3 <= VLMAX (VLMAX = VLENB / 2 here, i.e. 8 rows at the VLENB >= 16 required by the init code)
+
+ret
+endfunc
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 1/2] checkasm/blockdsp: add fill_block test

2024-04-29 Thread flow gg
Since there is no 8x16 block size, 8x16 is no longer tested; the updated patch is in this reply

flow gg  于2024年4月29日周一 15:09写道:

>
>
From fc7c28cb78e0c90880f31c0b8d6f2fc16d0fe581 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Mon, 29 Apr 2024 14:18:23 +0800
Subject: [PATCH 1/2] checkasm/blockdsp: add fill_block test

---
 tests/checkasm/blockdsp.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/tests/checkasm/blockdsp.c b/tests/checkasm/blockdsp.c
index 22a2f79455..19d69b8687 100644
--- a/tests/checkasm/blockdsp.c
+++ b/tests/checkasm/blockdsp.c
@@ -29,6 +29,11 @@
 #include "libavutil/intreadwrite.h"
 #include "libavutil/mem_internal.h"
 
+typedef struct {
+const char *name;
+int size;
+} test;
+
 #define randomize_buffers(size) \
 do {\
 int i;  \
@@ -52,6 +57,30 @@ do {\
 }   \
 } while (0)
 
+static void check_fill(BlockDSPContext *h){ /* checkasm test: compare every fill_block_tab[] entry against the reference and benchmark it */
+const test tests[] = { /* one entry per fill_block_tab[] slot: reported name and block width in bytes */
+{"fill_block_tab[0]", 16},
+{"fill_block_tab[1]", 8},
+};
+LOCAL_ALIGNED_32(uint8_t, buf0, [16 * 16]); /* output of the reference implementation */
+LOCAL_ALIGNED_32(uint8_t, buf1, [16 * 16]); /* output of the implementation under test */
+
+for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
+int n = tests[t].size; /* square block: n serves as both line_size and row count */
+declare_func(void, uint8_t *block, uint8_t value,
+ ptrdiff_t line_size, int h);
+if (check_func(h->fill_block_tab[t], "blockdsp.%s", tests[t].name)) {
+uint8_t value = rnd(); /* random fill byte */
+randomize_buffers(tests[t].size);
+call_ref(buf0, value, n, n);
+call_new(buf1, value, n, n);
+if (memcmp(buf0, buf1, sizeof(*buf0) * n * n)) /* rows are contiguous (line_size == n), so n * n bytes is the whole n x n block */
+fail();
+bench_new(buf0, value, n, n);
+}
+}
+}
+
 void checkasm_check_blockdsp(void)
 {
 LOCAL_ALIGNED_32(uint16_t, buf0, [6 * 8 * 8]);
@@ -64,5 +93,7 @@ void checkasm_check_blockdsp(void)
 check_clear(clear_block,  8 * 8);
 check_clear(clear_blocks, 8 * 8 * 6);
 
+check_fill(&h);
+
 report("blockdsp");
 }
-- 
2.44.0

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH 2/2] lavc/blockdsp: R-V V fill_block

2024-04-30 Thread flow gg
Since the number of stores is controlled by a3 (the AVL given to vsetvli)
and not by zero, VLENB doesn't have to be exactly 16 bytes, does it?

Rémi Denis-Courmont  于2024年4月30日周二 14:40写道:

>
>
> Le 30 avril 2024 03:26:25 GMT+03:00, flow gg  a
> écrit :
> >Hi, I initially used a loop, but according to libavcodec/blockdsp.h,
> >
> >the maximum is 8x16 = 128 bytes, so using ff_get_rv_vlenb() >= 16 and m8
> >does not require a loop.
>
> It's okay to assume that VLENB is at least 16 bytes (as long as it's
> checked), but the code seems to assume (?) that it's *exactly* 16 bytes,
> which will break on future hardware.
>
> >
> >```
> >/* add and put pixel (decoding)
> > * Block sizes for op_pixels_func are 8x4,8x8 16x8 16x16.
> > * h for op_pixels_func is limited to { width / 2, width },
> > * but never larger than 16 and never smaller than 4. */
> >typedef void (*op_fill_func)(uint8_t *block /* align width (8 or 16) */,
> > uint8_t value, ptrdiff_t line_size, int h);
> >```
> >
> >Rémi Denis-Courmont  于2024年4月30日周二 01:31写道:
> >
> >> Le maanantaina 29. huhtikuuta 2024, 10.09.41 EEST flow gg a écrit :
> >> >
> >>
> >> Are you sure that this works with all vector lengths?
> >> The block8 code looks odd.
> >>
> >> --
> >> レミ・デニ-クールモン
> >> http://www.remlab.net/
> >> ___
> >> ffmpeg-devel mailing list
> >> ffmpeg-devel@ffmpeg.org
> >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >>
> >> To unsubscribe, visit link above, or email
> >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> >>
> >___
> >ffmpeg-devel mailing list
> >ffmpeg-devel@ffmpeg.org
> >https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> >To unsubscribe, visit link above, or email
> >ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH 1/2] checkasm/rv40dsp: add chroma_mc test

2024-04-30 Thread flow gg

From 07c0b8a26b76e31c46ecabddb251f317c48c73a3 Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 30 Apr 2024 12:43:57 +0800
Subject: [PATCH 1/2] checkasm/rv40dsp: add chroma_mc test

This is similar to h264.
---
 tests/checkasm/Makefile   |  1 +
 tests/checkasm/checkasm.c |  3 ++
 tests/checkasm/checkasm.h |  1 +
 tests/checkasm/rv40dsp.c  | 75 +++
 tests/fate/checkasm.mak   |  1 +
 5 files changed, 81 insertions(+)
 create mode 100644 tests/checkasm/rv40dsp.c

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index d846a48585..559d88cba4 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -35,6 +35,7 @@ AVCODECOBJS-$(CONFIG_OPUS_DECODER)  += opusdsp.o
 AVCODECOBJS-$(CONFIG_PIXBLOCKDSP)   += pixblockdsp.o
 AVCODECOBJS-$(CONFIG_HEVC_DECODER)  += hevc_add_res.o hevc_deblock.o hevc_idct.o hevc_sao.o hevc_pel.o
 AVCODECOBJS-$(CONFIG_RV34DSP)   += rv34dsp.o
+AVCODECOBJS-$(CONFIG_RV40_DECODER)  += rv40dsp.o
 AVCODECOBJS-$(CONFIG_SVQ1_ENCODER)  += svq1enc.o
 AVCODECOBJS-$(CONFIG_TAK_DECODER)   += takdsp.o
 AVCODECOBJS-$(CONFIG_UTVIDEO_DECODER)   += utvideodsp.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index ffc04f0623..e007cd59a5 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -170,6 +170,9 @@ static const struct {
 #if CONFIG_RV34DSP
 { "rv34dsp", checkasm_check_rv34dsp },
 #endif
+#if CONFIG_RV40_DECODER
+{ "rv40dsp", checkasm_check_rv40dsp },
+#endif
 #if CONFIG_SVQ1_ENCODER
 { "svq1enc", checkasm_check_svq1enc },
 #endif
diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
index 1f31591ac0..3dadbb00ad 100644
--- a/tests/checkasm/checkasm.h
+++ b/tests/checkasm/checkasm.h
@@ -113,6 +113,7 @@ void checkasm_check_opusdsp(void);
 void checkasm_check_pixblockdsp(void);
 void checkasm_check_sbrdsp(void);
 void checkasm_check_rv34dsp(void);
+void checkasm_check_rv40dsp(void);
 void checkasm_check_svq1enc(void);
 void checkasm_check_synth_filter(void);
 void checkasm_check_sw_gbrp(void);
diff --git a/tests/checkasm/rv40dsp.c b/tests/checkasm/rv40dsp.c
new file mode 100644
index 00..a1a873d430
--- /dev/null
+++ b/tests/checkasm/rv40dsp.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include 
+#include 
+#include "checkasm.h"
+#include "libavcodec/rv40dsp.c"
+#include "libavutil/mem_internal.h"
+
+#define randomize_buffers()  \
+do { \
+for (int i = 0; i < 16*18*2; i++)\
+src[i] = rnd() & 0x3;\
+} while (0)
+
+static void check_chroma_mc(void)
+{
+RV34DSPContext h;
+LOCAL_ALIGNED_32(uint8_t, src,  [16 * 18 * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst0, [16 * 18 * 2]);
+LOCAL_ALIGNED_32(uint8_t, dst1, [16 * 18 * 2]);
+
+declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *dst, const uint8_t *src,
+  ptrdiff_t stride, int h, int x, int y);
+
+ff_rv40dsp_init(&h);
+randomize_buffers();
+for (int size = 0; size < 2; size++) {
+
+#define CHECK_CHROMA_MC(name) \
+do {  \
+if (check_func(h.name## _pixels_tab[size], #name "_mc%d", 1 << (3 - size))) { \
+for (int x = 0; x < 2; x++) { \
+for (int y = 0; y < 2; y++) { \
+memcpy(dst0, src, 16 * 18);   \
+memcpy(dst1, src, 16 * 18);   \
+call_ref(dst0, src, 16, 16, x, y);\
+call_new(dst1, src, 16, 16, x, y);\
+if (memcmp(dst0, dst1, 16 * 16)) {\
+fprintf(stderr, #na

[FFmpeg-devel] [PATCH 2/2] lavc/rv40dsp: R-V V chroma_mc

2024-04-30 Thread flow gg

From 3e66b2bbe257cc91a4c2169362163e92aba6760b Mon Sep 17 00:00:00 2001
From: sunyuechi 
Date: Tue, 30 Apr 2024 18:24:00 +0800
Subject: [PATCH 2/2] lavc/rv40dsp: R-V V chroma_mc

This is similar to h264, but here we use manual_avg instead of vaaddu
because rv40's OP differs from h264. If we use vaaddu,
rv40 would need to repeatedly switch between vxrm=0 and vxrm=2,
and switching vxrm is very slow.

C908:
avg_chroma_mc4_c: 2330.0
avg_chroma_mc4_rvv_i32: 602.7
avg_chroma_mc8_c: 1211.0
avg_chroma_mc8_rvv_i32: 602.7
put_chroma_mc4_c: 1825.0
put_chroma_mc4_rvv_i32: 414.7
put_chroma_mc8_c: 932.0
put_chroma_mc8_rvv_i32: 414.7
---
 libavcodec/riscv/Makefile   |   2 +
 libavcodec/riscv/rv40dsp_init.c |  51 +
 libavcodec/riscv/rv40dsp_rvv.S  | 371 
 libavcodec/rv34dsp.h|   1 +
 libavcodec/rv40dsp.c|   2 +
 5 files changed, 427 insertions(+)
 create mode 100644 libavcodec/riscv/rv40dsp_init.c
 create mode 100644 libavcodec/riscv/rv40dsp_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index dce1236b84..43b5c21cf4 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -50,6 +50,8 @@ RV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvi.o
 RVV-OBJS-$(CONFIG_PIXBLOCKDSP) += riscv/pixblockdsp_rvv.o
 OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_init.o
 RVV-OBJS-$(CONFIG_RV34DSP) += riscv/rv34dsp_rvv.o
+OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_init.o
+RVV-OBJS-$(CONFIG_RV40_DECODER) += riscv/rv40dsp_rvv.o
 OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_init.o
 RVV-OBJS-$(CONFIG_SVQ1_ENCODER) += riscv/svqenc_rvv.o
 OBJS-$(CONFIG_TAK_DECODER) += riscv/takdsp_init.o
diff --git a/libavcodec/riscv/rv40dsp_init.c b/libavcodec/riscv/rv40dsp_init.c
new file mode 100644
index 00..f5a5510b28
--- /dev/null
+++ b/libavcodec/riscv/rv40dsp_init.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "config.h"
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/riscv/cpu.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_put_rv40_chroma_mc4_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+void ff_avg_rv40_chroma_mc8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,
+ int h, int x, int y);
+
+av_cold void ff_rv40dsp_init_riscv(RV34DSPContext *c)
+{
+#if HAVE_RVV
+int flags = av_get_cpu_flags();
+
+if ((flags & AV_CPU_FLAG_RVV_I32) && ff_get_rv_vlenb() >= 16 &&
+(flags & AV_CPU_FLAG_RVB_ADDR)) {
+c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_rvv;
+c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_rvv;
+c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_rvv;
+c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_rvv;
+}
+#endif
+}
diff --git a/libavcodec/riscv/rv40dsp_rvv.S b/libavcodec/riscv/rv40dsp_rvv.S
new file mode 100644
index 00..e49345ef70
--- /dev/null
+++ b/libavcodec/riscv/rv40dsp_rvv.S
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth F

Re: [FFmpeg-devel] [PATCH 2/4] lavc/vp9dsp: R-V V ipred vert

2024-05-02 Thread flow gg
Sorry, this was because a 'bpp == 8' check was missing. It has been fixed
on the branch linked below.

Rémi Denis-Courmont  于2024年5月2日周四 22:11写道:

> Le tiistaina 30. huhtikuuta 2024, 2.36.22 EEST flow gg a écrit :
> > updated it in the reply and https://github.com/hleft/FFmpeg/tree/vp8vp9
>
> VP9 checkasm does not pass on that branch.
>
> > Rémi Denis-Courmont  于2024年4月30日周二 01:57写道:
> >
> > > Le perjantaina 22. maaliskuuta 2024, 8.02.38 EEST flow gg a écrit :
> > > > Because the previous patch was updated, so it was updated in this
> > >
> > > response
> > >
> > > Seemingly needs rebase since April 7.
> > >
> > > --
> > > レミ・デニ-クールモン
> > > http://www.remlab.net/
> > >
> > >
> > >
> > > ___
> > > ffmpeg-devel mailing list
> > > ffmpeg-devel@ffmpeg.org
> > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> > >
> > > To unsubscribe, visit link above, or email
> > > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
>
> --
> レミ・デニ-クールモン
> http://www.remlab.net/
>
>
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [RFC] 5 year plan & Inovation

2024-05-03 Thread flow gg
I saw the discussion comparing email with GitLab/GitHub. I do not fully
understand all of their advantages and disadvantages, but I want to say that
I support changing to GitLab/GitHub.

Simple reason:

If git-send-email is required, I may not be able to submit any code.
If git-send-email is not required, it is troublesome for both the
reviewer and the contributor.

In detail:

I have tried git-send-email, but it failed. You can say that I am stupid,
but I would say that this is due to various reasons such as my region and
the network. It is really not something I can solve.
Maybe I will spend a lot of energy trying it again in the future, but only
because I have already submitted thousands of lines of code and don't want
to give up. If I had hit this at the very beginning, I would have given up.

Maybe I am one of the younger people here in FFmpeg. I have a lot of good
young developers around me. They all use GitHub/GitLab by default, and they
would run into the same problem as me and give up.

I don't really care which of these tools is better. I think people are what
matters. I only want something that I can actually use, and that makes
reviewing easier for the people who really do the reviews.

I don't know if I can say my personal feelings here, but I will say:

I feel looked down on by the following passage, which makes me
uncomfortable. If you were a reviewer, maybe I would have had no chance to
contribute, but in any case, I have made some contributions.

> How can anyone use git, but not git send-email? Any decent email provider
> has support for external clients over SMTP. And I believe you *can*
> actually dictate that people don't attach patches -- if you have control
> over the mailing list software, you can set up a filter that rejects such
> emails and auto-replies with instructions on how to send them properly.

I think I should have the right to contribute

Ondřej Fiala  于2024年5月2日周四 22:25写道:

> On Wed May 1, 2024 at 7:27 AM CEST, Rémi Denis-Courmont wrote:
> > Le 30 avril 2024 22:15:10 GMT+03:00, "Ondřej Fiala" 
> a écrit :
> > >On Tue Apr 30, 2024 at 9:06 PM CEST, Hendrik Leppkes wrote:
> > >> I will take the replacement instead, thanks. Email is archaic. The
> > >> entire point is to get away from email, not dress it up.
> > >> SourceHut usage would likely make me even less interested then today.
> > >>
> > >> - Hendrik
> > >I guess that depends on how (and with what) you use it. Using it with
> > >Gmail UI for example is obviously not a great idea. No idea whether you
> > >do, but if you do, you should be upset at Gmail, not email.
> >
> > I don't use Gmail, and using email for review still sucks. No matter how
> you
> > slice it, email was not meant for threaded code reviews.
> Email was not meant for a lot of what it's used for today. Many email
> clients
> have support for threading, and unlike GitHub allow threads of arbitrary
> depth. Using such a client with commands for moving between messages in a
> a thread etc. makes threaded code review over email quite usably in my
> opinion.
>
> > Also while I can use git-send-email, not everyone can. And patches as
> > attachments are simply awful. Unfortunately I can't dictate that people
> don't
> > send patches that way.
> How can anyone use git, but not git send-email? Any decent email provider
> has support for external clients over SMTP. And I believe you *can*
> actually
> dictate that people don't attach patches -- if you have control over the
> mailing list software, you can set up a filter that rejects such emails
> and auto-replies with instructions on how to send them properly.
>
> > >But you did not answer my question: which specific code review features
> > >are you missing?
> >
> > Proper threaded reviews with state tracking, ability to collapse and
> expand
> > context and files, and proper listing of open MR (*not* like patchwork).
> I can sort of understand everything except the last one. What is "a proper
> listing of open MR" supposed to mean...? (I know what a merge request is,
> of course, but I don't get how the way GitLab lists them is supposedly
> superior to SourceHut's list of patches.)
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-04 Thread flow gg
Hi, it's me. I accidentally repeated it but it seems to be correct.

 于2024年5月4日周六 18:01写道:

> From: sunyuechi 
>
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> ---
>  libavcodec/riscv/vc1dsp_init.c |  8 +
>  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++
>  2 files changed, 74 insertions(+)
>
> diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> index e47b644f80..610c43a1a3 100644
> --- a/libavcodec/riscv/vc1dsp_init.c
> +++ b/libavcodec/riscv/vc1dsp_init.c
> @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t stride, int16_t *block
>  void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_8x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
>  void ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> *block);
> +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_put_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
> +void ff_avg_pixels8x8_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> line_size, int rnd);
>
>  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  {
> @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
>  if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
>  dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
>  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> +dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> +dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
>  if (flags & AV_CPU_FLAG_RVV_I64) {
>  dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
>  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> +dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> +dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
>  }
>  }
>  #endif
> diff --git a/libavcodec/riscv/vc1dsp_rvv.S b/libavcodec/riscv/vc1dsp_rvv.S
> index 4a00945ead..48244f91aa 100644
> --- a/libavcodec/riscv/vc1dsp_rvv.S
> +++ b/libavcodec/riscv/vc1dsp_rvv.S
> @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
>  vsse32.v  v0, (a0), a1
>  ret
>  endfunc
> +
> +func ff_put_pixels16x16_rvv, zve32x
> +vsetivli  zero, 16, e8, m1, ta, ma
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vle8.vv\n, (a1)
> +add   a1, a1, a2
> +.endr
> +vle8.vv31, (a1)
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vse8.vv\n, (a0)
> +add   a0, a0, a2
> +.endr
> +vse8.vv31, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_put_pixels8x8_rvv, zve64x
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v8, (a1), a2
> +vsse64.v  v8, (a0), a2
> +
> +ret
> +endfunc
> +
> +func ff_avg_pixels16x16_rvv, zve32x
> +csrwi vxrm, 0
> +vsetivli  zero, 16, e8, m1, ta, ma
> +lit0, 128
> +
> +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
> +vle8.vv\n, (a1)
> +add   a1, a1, a2
> +.endr
> +vle8.vv31, (a1)
> +.irp n 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
> +vle8.vv\n, (a0)
> +add   a0, a0, a2
> +.endr
> +vle8.vv15, (a0)
> +vsetvli   zero, t0, e8, m8, ta, ma
> +vaaddu.vv v0, v0, v16
> +vaaddu.vv v8, v8, v24
> +vsetivli  zero, 16, e8, m1, ta, ma
> +.irp n  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
> +vse8.vv\n, (a0)
> +sub   a0, a0, a2
> +.endr
> +vse8.vv0, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_avg_pixels8x8_rvv, zve64x
> +csrwi vxrm, 0
> +lit0, 64
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vlse64.v  v16, (a1), a2
> +vlse64.v  v8, (a0), a2
> +vsetvli   zero, t0, e8, m4, ta, ma
> +vaaddu.vv v16, v16, v8
> +vsetivli  zero, 8, e8, mf2, ta, ma
> +vsse64.v  v16, (a0), a2
> +
> +ret
> +endfunc
> --
> 2.45.0
>
> ___

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp8dsp: R-V V put_vp8_pixels

2024-05-04 Thread flow gg
I've reorganized it, and the GitHub link is:
https://github.com/hleft/FFmpeg/tree/vp8

 于2024年5月4日周六 22:49写道:

> From: sunyuechi 
>
> C908:
> vp8_put_pixels4_c: 87.5
> vp8_put_pixels4_rvv_i32: 42.7
> vp8_put_pixels8_c: 284.5
> vp8_put_pixels8_rvv_i32: 77.7
> vp8_put_pixels16_c: 1087.7
> vp8_put_pixels16_rvv_i32: 108.0
> ---
>  libavcodec/riscv/vp8dsp.h  | 75 ++
>  libavcodec/riscv/vp8dsp_init.c | 22 ++
>  libavcodec/riscv/vp8dsp_rvv.S  | 27 
>  libavcodec/vp8dsp.c|  2 +
>  libavcodec/vp8dsp.h|  1 +
>  5 files changed, 127 insertions(+)
>  create mode 100644 libavcodec/riscv/vp8dsp.h
>
> diff --git a/libavcodec/riscv/vp8dsp.h b/libavcodec/riscv/vp8dsp.h
> new file mode 100644
> index 00..971c5c0a96
> --- /dev/null
> +++ b/libavcodec/riscv/vp8dsp.h
> @@ -0,0 +1,75 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_VP8DSP_H
> +#define AVCODEC_RISCV_VP8DSP_H
> +
> +#include "libavcodec/vp8dsp.h"
> +
> +#define VP8_LF_Y(hv, inner, opt)
>\
> +void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,
> \
> +ptrdiff_t stride,
> \
> +int flim_E, int
> flim_I,  \
> +int hev_thresh)
> +
> +#define VP8_LF_UV(hv, inner, opt)
> \
> +void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,
> \
> + uint8_t *dstV,
> \
> + ptrdiff_t stride,
>\
> + int flim_E, int
> flim_I, \
> + int hev_thresh)
> +
> +#define VP8_LF_SIMPLE(hv, opt)  \
> +void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
> +  ptrdiff_t stride, \
> +  int flim)
> +
> +#define VP8_LF_HV(inner, opt)   \
> +VP8_LF_Y(h,  inner, opt);   \
> +VP8_LF_Y(v,  inner, opt);   \
> +VP8_LF_UV(h, inner, opt);   \
> +VP8_LF_UV(v, inner, opt)
> +
> +#define VP8_LF(opt) \
> +VP8_LF_HV(,   opt); \
> +VP8_LF_HV(_inner, opt); \
> +VP8_LF_SIMPLE(h, opt);  \
> +VP8_LF_SIMPLE(v, opt)
> +
> +#define VP8_MC(n, opt)  \
> +void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,  \
> +const uint8_t *src, ptrdiff_t srcstride,\
> +int h, int x, int y)
> +
> +#define VP8_EPEL(w, opt)\
> +VP8_MC(pixels ## w, opt);   \
> +VP8_MC(epel ## w ## _h4, opt);  \
> +VP8_MC(epel ## w ## _h6, opt);  \
> +VP8_MC(epel ## w ## _v4, opt);  \
> +VP8_MC(epel ## w ## _h4v4, opt);\
> +VP8_MC(epel ## w ## _h6v4, opt);\
> +VP8_MC(epel ## w ## _v6, opt);  \
> +VP8_MC(epel ## w ## _h4v6, opt);\
> +VP8_MC(epel ## w ## _h6v6, opt)
> +
> +#define VP8_BILIN(w, opt)   \
> +VP8_MC(bilin ## w ## _h, opt);  \
> +VP8_MC(bilin ## w ## _v, opt);  \
> +VP8_MC(bilin ## w ## _hv, opt)
> +
> +#endif /* AVCODEC_RISCV_VP8DSP_H */
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index af57aabb71..c364de3dc9 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -24,11 +24,33 @@
>  #include "libavutil/cpu.h"
>  #include "libavutil/riscv/cpu.h"
>  #include "libavcodec/vp8dsp.h"
> +#include "vp8dsp.h"
>
>  void ff_vp8_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t
> stride);
>  void ff_vp8_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16],
> ptrdiff_t stride);
>  void ff_vp8_idct_dc_add4uv_rvv(ui

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp9dsp: R-V V ipred vert

2024-05-04 Thread flow gg
the github link: https://github.com/hleft/FFmpeg/tree/vp9

 于2024年5月4日周六 23:03写道:

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvv_i64: 18.5
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvv_i32: 50.7
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvv_i32: 136.7
> ---
>  libavcodec/riscv/vp9_intra_rvv.S | 35 
>  libavcodec/riscv/vp9dsp.h|  6 ++
>  libavcodec/riscv/vp9dsp_init.c   |  3 +++
>  3 files changed, 44 insertions(+)
>
> diff --git a/libavcodec/riscv/vp9_intra_rvv.S
> b/libavcodec/riscv/vp9_intra_rvv.S
> index db9774c263..b5f0f9d3c3 100644
> --- a/libavcodec/riscv/vp9_intra_rvv.S
> +++ b/libavcodec/riscv/vp9_intra_rvv.S
> @@ -113,3 +113,38 @@ func_dc dc_left  8   left 3  0  zve64x
>  func_dc dc_top   32  top  5  1  zve32x
>  func_dc dc_top   16  top  4  1  zve32x
>  func_dc dc_top   8   top  3  0  zve64x
> +
> +func ff_v_32x32_rvv, zve32x
> +vsetivli zero, 8, e8, mf2, ta, ma
> +vle32.v  v8, (a3)
> +
> +.rept 31
> +vse32.v  v8, (a0)
> +add  a0, a0, a1
> +.endr
> +vse32.v  v8, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvv, zve32x
> +vsetivli zero, 4, e8, mf4, ta, ma
> +vle32.v  v8, (a3)
> +
> +.rept 15
> +vse32.v  v8, (a0)
> +add  a0, a0, a1
> +.endr
> +vse32.v  v8, (a0)
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvv, zve64x
> +ld   t0, (a3)
> +vsetivli zero, 8, e64, m4, ta, ma
> +vmv.v.x  v8, t0
> +vsse64.v v8, (a0), a1
> +
> +ret
> +endfunc
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..113397ce86 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..9c550d40b5 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -36,6 +36,7 @@ static av_cold void
> vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
>  dsp->intra_pred[TX_8X8][DC_128_PRED] = ff_dc_128_8x8_rvv;
>  dsp->intra_pred[TX_8X8][DC_129_PRED] = ff_dc_129_8x8_rvv;
>  dsp->intra_pred[TX_8X8][TOP_DC_PRED] = ff_dc_top_8x8_rvv;
> +dsp->intra_pred[TX_8X8][VERT_PRED] = ff_v_8x8_rvv;
>  }
>
>  if (bpp == 8 && flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb()
> >= 16) {
> @@ -51,6 +52,8 @@ static av_cold void
> vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
>  dsp->intra_pred[TX_16X16][DC_129_PRED] = ff_dc_129_16x16_rvv;
>  dsp->intra_pred[TX_32X32][TOP_DC_PRED] = ff_dc_top_32x32_rvv;
>  dsp->intra_pred[TX_16X16][TOP_DC_PRED] = ff_dc_top_16x16_rvv;
> +dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvv;
> +dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvv;
>  }
>  #endif
>  }
> --
> 2.45.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


Re: [FFmpeg-devel] [PATCH] lavc/vc1dsp: R-V V mspel_pixels

2024-05-05 Thread flow gg
> Is it not faster to compute the address ahead of time, e.g.:
> Ditto below and in other patches.

Yes, updated here, and I will check the other patches.

> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version may be more portable and work just as well.

The C version shares its logic with other places, so it might be difficult
to modify. I've updated the patch to use RVI instead.

> Does MF2 actually improve perfs over M1 here?

The difference here seems very small, but when both mf2 and m1 are correct,
the test results have only shown mf2 to be better, so I want to use mf2.

Rémi Denis-Courmont  于2024年5月5日周日 01:53写道:

> Le lauantaina 4. toukokuuta 2024, 13.01.05 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_c: 869.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[0][0]_rvv_i32: 148.7
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_c: 220.5
> > vc1dsp.avg_vc1_mspel_pixels_tab[1][0]_rvv_i64: 56.2
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_c: 523.7
> > vc1dsp.put_vc1_mspel_pixels_tab[0][0]_rvv_i32: 82.0
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_c: 138.5
> > vc1dsp.put_vc1_mspel_pixels_tab[1][0]_rvv_i64: 23.7
> > ---
> >  libavcodec/riscv/vc1dsp_init.c |  8 +
> >  libavcodec/riscv/vc1dsp_rvv.S  | 66 ++
> >  2 files changed, 74 insertions(+)
> >
> > diff --git a/libavcodec/riscv/vc1dsp_init.c
> b/libavcodec/riscv/vc1dsp_init.c
> > index e47b644f80..610c43a1a3 100644
> > --- a/libavcodec/riscv/vc1dsp_init.c
> > +++ b/libavcodec/riscv/vc1dsp_init.c
> > @@ -29,6 +29,10 @@ void ff_vc1_inv_trans_8x8_dc_rvv(uint8_t *dest,
> ptrdiff_t
> > stride, int16_t *block void ff_vc1_inv_trans_4x8_dc_rvv(uint8_t *dest,
> > ptrdiff_t stride, int16_t *block); void
> ff_vc1_inv_trans_8x4_dc_rvv(uint8_t
> > *dest, ptrdiff_t stride, int16_t *block); void
> > ff_vc1_inv_trans_4x4_dc_rvv(uint8_t *dest, ptrdiff_t stride, int16_t
> > *block); +void ff_put_pixels16x16_rvv(uint8_t *dst, const uint8_t *src,
> > ptrdiff_t line_size, int rnd); +void ff_put_pixels8x8_rvv(uint8_t *dst,
> > const uint8_t *src, ptrdiff_t line_size, int rnd); +void
> > ff_avg_pixels16x16_rvv(uint8_t *dst, const uint8_t *src, ptrdiff_t
> > line_size, int rnd); +void ff_avg_pixels8x8_rvv(uint8_t *dst, const
> uint8_t
> > *src, ptrdiff_t line_size, int rnd);
> >
> >  av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  {
> > @@ -38,9 +42,13 @@ av_cold void ff_vc1dsp_init_riscv(VC1DSPContext *dsp)
> >  if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> >  dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_rvv;
> >  dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_rvv;
> > +dsp->put_vc1_mspel_pixels_tab[0][0] = ff_put_pixels16x16_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[0][0] = ff_avg_pixels16x16_rvv;
> >  if (flags & AV_CPU_FLAG_RVV_I64) {
> >  dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_rvv;
> >  dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_rvv;
> > +dsp->put_vc1_mspel_pixels_tab[1][0] = ff_put_pixels8x8_rvv;
> > +dsp->avg_vc1_mspel_pixels_tab[1][0] = ff_avg_pixels8x8_rvv;
> >  }
> >  }
> >  #endif
> > diff --git a/libavcodec/riscv/vc1dsp_rvv.S
> b/libavcodec/riscv/vc1dsp_rvv.S
> > index 4a00945ead..48244f91aa 100644
> > --- a/libavcodec/riscv/vc1dsp_rvv.S
> > +++ b/libavcodec/riscv/vc1dsp_rvv.S
> > @@ -111,3 +111,69 @@ func ff_vc1_inv_trans_4x4_dc_rvv, zve32x
> >  vsse32.v  v0, (a0), a1
> >  ret
> >  endfunc
> > +
> > +func ff_put_pixels16x16_rvv, zve32x
> > +vsetivli  zero, 16, e8, m1, ta, ma
> > +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +vle8.vv\n, (a1)
> > +add   a1, a1, a2
> > +.endr
> > +vle8.vv31, (a1)
>
> Is it not faster to compute the address ahead of time, e.g.:
>
> add t1, a2, a1
> vle8.v vN, (a1)
> sh1add a1, a2, a1
> vle8.v vN+1, (t1)
>
> ...and so on? Even on a reordering core, you can't eliminate stall on data
> dependency if there is nothing else to be done.
>
> (Ditto below and in other patches.)
>
> > +.irp n 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
> 30
> > +vse8.vv\n, (a0)
> > +add   a0, a0, a2
> > +.endr
> > +vse8.vv31, (a0)
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_put_pixels8x8_rvv, zve64x
> > +vsetivli  zero, 8, e8, mf2, ta, ma
> > +vlse64.v  v8, (a1), a2
> > +vsse64.v  v8, (a0), a2
>
> Copying 64-bit quantities should not need RVV at all. Maybe the C version
> needs to be improved instead, but if that is not possible, then an RVI
> version
> may be more portable and work just as well.
>
> > +
> > +ret
> > +endfunc
> > +
> > +func ff_avg_pixels16x16_rvv, zve32x
> > +csrwi vxrm,

Re: [FFmpeg-devel] [PATCH 01/10] lavc/vp8dsp: R-V put_vp8_pixels

2024-05-05 Thread flow gg
Made these changes according to the previous review:
moved func into the macro, added a vset macro to reduce if/else, used RVI,
and added the __riscv_xlen check.

 于2024年5月6日周一 00:45写道:

> From: sunyuechi 
>
> C908:
> vp8_put_pixels4_c: 78.0
> vp8_put_pixels4_rvi: 33.7
> vp8_put_pixels8_c: 278.0
> vp8_put_pixels8_rvi: 55.0
> vp8_put_pixels16_c: 999.0
> vp8_put_pixels16_rvi: 86.7
> ---
>  libavcodec/riscv/Makefile  |  1 +
>  libavcodec/riscv/vp8dsp.h  | 75 ++
>  libavcodec/riscv/vp8dsp_init.c | 22 ++
>  libavcodec/riscv/vp8dsp_rvi.S  | 61 +++
>  libavcodec/vp8dsp.c|  2 +
>  libavcodec/vp8dsp.h|  1 +
>  6 files changed, 162 insertions(+)
>  create mode 100644 libavcodec/riscv/vp8dsp.h
>  create mode 100644 libavcodec/riscv/vp8dsp_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 050c08ee61..526cb5c97c 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -61,6 +61,7 @@ RVV-OBJS-$(CONFIG_UTVIDEO_DECODER) +=
> riscv/utvideodsp_rvv.o
>  OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_init.o
>  RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
>  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
> +RV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvi.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
> diff --git a/libavcodec/riscv/vp8dsp.h b/libavcodec/riscv/vp8dsp.h
> new file mode 100644
> index 00..971c5c0a96
> --- /dev/null
> +++ b/libavcodec/riscv/vp8dsp.h
> @@ -0,0 +1,75 @@
> +/*
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#ifndef AVCODEC_RISCV_VP8DSP_H
> +#define AVCODEC_RISCV_VP8DSP_H
> +
> +#include "libavcodec/vp8dsp.h"
> +
> +#define VP8_LF_Y(hv, inner, opt)
>\
> +void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst,
> \
> +ptrdiff_t stride,
> \
> +int flim_E, int
> flim_I,  \
> +int hev_thresh)
> +
> +#define VP8_LF_UV(hv, inner, opt)
> \
> +void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU,
> \
> + uint8_t *dstV,
> \
> + ptrdiff_t stride,
>\
> + int flim_E, int
> flim_I, \
> + int hev_thresh)
> +
> +#define VP8_LF_SIMPLE(hv, opt)  \
> +void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \
> +  ptrdiff_t stride, \
> +  int flim)
> +
> +#define VP8_LF_HV(inner, opt)   \
> +VP8_LF_Y(h,  inner, opt);   \
> +VP8_LF_Y(v,  inner, opt);   \
> +VP8_LF_UV(h, inner, opt);   \
> +VP8_LF_UV(v, inner, opt)
> +
> +#define VP8_LF(opt) \
> +VP8_LF_HV(,   opt); \
> +VP8_LF_HV(_inner, opt); \
> +VP8_LF_SIMPLE(h, opt);  \
> +VP8_LF_SIMPLE(v, opt)
> +
> +#define VP8_MC(n, opt)  \
> +void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride,  \
> +const uint8_t *src, ptrdiff_t srcstride,\
> +int h, int x, int y)
> +
> +#define VP8_EPEL(w, opt)\
> +VP8_MC(pixels ## w, opt);   \
> +VP8_MC(epel ## w ## _h4, opt);  \
> +VP8_MC(epel ## w ## _h6, opt);  \
> +VP8_MC(epel ## w ## _v4, opt);  \
> +VP8_MC(epel ## w ## _h4v4, opt);\
> +VP8_MC(epel ## w ## _h6v4, opt);\
> +VP8_MC(epel ## w ## _v6, opt);  \
> +VP8_MC(epel ## w ## _h4v6, opt);\
> +VP8_MC(epel ## w ## _h6v6, opt)
> +
> +#define VP8_BILIN(w, o

Re: [FFmpeg-devel] [PATCH v3 2/9] lavc/vp8dsp: R-V V put_bilin_h v

2024-05-05 Thread flow gg
> Doesn't this effectively discard the last element, t5?
> Can't we skip the slide and just load the vector at a2+1? Also then, we can
> keep VL=len and halve the multiplier.

Yes, this is better. I remember that using slide1down was faster when I
tested the initial version, but that is no longer the case.
I modified it to load from a2+1 and merged the h and v versions.

 于2024年5月6日周一 11:38写道:

> From: sunyuechi 
>
> C908:
> vp8_put_bilin4_h_c: 367.0
> vp8_put_bilin4_h_rvv_i32: 137.7
> vp8_put_bilin4_v_c: 377.0
> vp8_put_bilin4_v_rvv_i32: 137.7
> vp8_put_bilin8_h_c: 1431.0
> vp8_put_bilin8_h_rvv_i32: 297.5
> vp8_put_bilin8_v_c: 1449.0
> vp8_put_bilin8_v_rvv_i32: 297.5
> vp8_put_bilin16_h_c: 2839.0
> vp8_put_bilin16_h_rvv_i32: 344.7
> vp8_put_bilin16_v_c: 2857.0
> vp8_put_bilin16_v_rvv_i32: 344.7
> ---
>  libavcodec/riscv/vp8dsp_init.c | 21 +++
>  libavcodec/riscv/vp8dsp_rvv.S  | 49 ++
>  2 files changed, 70 insertions(+)
>
> diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> index fa3feeacf7..afffa6de2f 100644
> --- a/libavcodec/riscv/vp8dsp_init.c
> +++ b/libavcodec/riscv/vp8dsp_init.c
> @@ -34,6 +34,10 @@ VP8_EPEL(16, rvi);
>  VP8_EPEL(8,  rvi);
>  VP8_EPEL(4,  rvi);
>
> +VP8_BILIN(16, rvv);
> +VP8_BILIN(8,  rvv);
> +VP8_BILIN(4,  rvv);
> +
>  av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  {
>  #if HAVE_RV
> @@ -48,6 +52,23 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
>  c->put_vp8_epel_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  c->put_vp8_bilinear_pixels_tab[2][0][0] = ff_put_vp8_pixels4_rvi;
>  }
> +#if HAVE_RVV
> +if (flags & AV_CPU_FLAG_RVV_I32 && ff_get_rv_vlenb() >= 16) {
> +c->put_vp8_bilinear_pixels_tab[0][0][1] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][0][2] =
> ff_put_vp8_bilin16_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_rvv;
> +
> +c->put_vp8_bilinear_pixels_tab[0][1][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[0][2][0] =
> ff_put_vp8_bilin16_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_rvv;
> +c->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_rvv;
> +}
> +#endif
>  #endif
>  }
>
> diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S
> index 8a0773f964..9bf969d794 100644
> --- a/libavcodec/riscv/vp8dsp_rvv.S
> +++ b/libavcodec/riscv/vp8dsp_rvv.S
> @@ -20,6 +20,18 @@
>
>  #include "libavutil/riscv/asm.S"
>
> +.macro vsetvlstatic8 len
> +.if \len <= 4
> +vsetivlizero, \len, e8, mf4, ta, ma
> +.elseif \len <= 8
> +vsetivlizero, \len, e8, mf2, ta, ma
> +.elseif \len <= 16
> +vsetivlizero, \len, e8, m1, ta, ma
> +.elseif \len <= 31
> +vsetivlizero, \len, e8, m2, ta, ma
> +.endif
> +.endm
> +
>  .macro vp8_idct_dc_add
>  vlse32.v  v0, (a0), a2
>  lha5, 0(a1)
> @@ -71,3 +83,40 @@ func ff_vp8_idct_dc_add4uv_rvv, zve32x
>
>  ret
>  endfunc
> +
> +.macro bilin_load dst len type mn
> +.ifc \type,v
> +add t5, a2, a3
> +.elseif \type == h
> +addit5, a2, 1
> +.endif
> +vle8.v  \dst, (a2)
> +vle8.v  v2, (t5)
> +vwmulu.vx   v28, \dst, t1
> +vwmaccu.vx  v28, \mn, v2
> +vwaddu.wx   v24, v28, t4
> +vnsra.wi\dst, v24, 3
> +.endm
> +
> +.macro put_vp8_bilin_h_v len type mn
> +func ff_put_vp8_bilin\len\()_\type\()_rvv, zve32x
> +vsetvlstatic8   \len
> +li  t1, 8
> +li  t4, 4
> +sub t1, t1, \mn
> +1:
> +addia4, a4, -1
> +bilin_load  v0, \len, \type, \mn
> +vse8.v  v0, (a0)
> +add a2, a2, a3
> +add a0, a0, a1
> +bneza4, 1b
> +
> +ret
> +endfunc
> +.endm
> +
> +.irp len 16,8,4
> +put_vp8_bilin_h_v \len h a5
> +put_vp8_bilin_h_v \len v a6
> +.endr
> --
> 2.45.0
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org w

Re: [FFmpeg-devel] [PATCH v3 6/9] lavc/vp8dsp: R-V V put_epel hv

2024-05-06 Thread flow gg
> IMO, passing a complete register name, if you really need to vary it,
> would be simpler and more flexible than an ABI register type prefix.

If the full register names were passed here, some calls would require four
parameters and some six, often with repetition.
I feel it would be easy to get confused about the differences between the
parameters passed each time.
If we use a prefix instead, only one parameter is needed, which I think
would be less error-prone.
> This code actually requires ==, not >=.
> You can do that but you only need half the stack space and offsets.

Ok, fixed it

Rémi Denis-Courmont  于2024年5月7日周二 03:25写道:

> Le maanantaina 6. toukokuuta 2024, 6.38.06 EEST u...@foxmail.com a écrit :
> > From: sunyuechi 
> >
> > C908:
> > vp8_put_epel4_h4v4_c: 20.0
> > vp8_put_epel4_h4v4_rvv_i32: 11.0
> > vp8_put_epel4_h4v6_c: 25.2
> > vp8_put_epel4_h4v6_rvv_i32: 13.5
> > vp8_put_epel4_h6v4_c: 22.2
> > vp8_put_epel4_h6v4_rvv_i32: 14.5
> > vp8_put_epel4_h6v6_c: 29.0
> > vp8_put_epel4_h6v6_rvv_i32: 15.7
> > vp8_put_epel8_h4v4_c: 73.0
> > vp8_put_epel8_h4v4_rvv_i32: 22.2
> > vp8_put_epel8_h4v6_c: 90.5
> > vp8_put_epel8_h4v6_rvv_i32: 26.7
> > vp8_put_epel8_h6v4_c: 85.0
> > vp8_put_epel8_h6v4_rvv_i32: 27.2
> > vp8_put_epel8_h6v6_c: 104.7
> > vp8_put_epel8_h6v6_rvv_i32: 29.5
> > vp8_put_epel16_h4v4_c: 145.5
> > vp8_put_epel16_h4v4_rvv_i32: 26.5
> > vp8_put_epel16_h4v6_c: 190.7
> > vp8_put_epel16_h4v6_rvv_i32: 47.5
> > vp8_put_epel16_h6v4_c: 173.7
> > vp8_put_epel16_h6v4_rvv_i32: 33.2
> > vp8_put_epel16_h6v6_c: 222.2
> > vp8_put_epel16_h6v6_rvv_i32: 35.5
> > ---
> >  libavcodec/riscv/vp8dsp_init.c |  13 
> >  libavcodec/riscv/vp8dsp_rvv.S  | 117 +++--
> >  2 files changed, 109 insertions(+), 21 deletions(-)
> >
> > diff --git a/libavcodec/riscv/vp8dsp_init.c
> b/libavcodec/riscv/vp8dsp_init.c
> > index dc3e087f01..463c8fa0a2 100644
> > --- a/libavcodec/riscv/vp8dsp_init.c
> > +++ b/libavcodec/riscv/vp8dsp_init.c
> > @@ -97,6 +97,19 @@ av_cold void ff_vp78dsp_init_riscv(VP8DSPContext *c)
> >  c->put_vp8_epel_pixels_tab[0][1][0] = ff_put_vp8_epel16_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_rvv;
> >  c->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_rvv;
> > +
> > +c->put_vp8_epel_pixels_tab[0][2][2] =
> ff_put_vp8_epel16_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][2][1] =
> ff_put_vp8_epel16_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][1] =
> ff_put_vp8_epel16_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_rvv;
> > +c->put_vp8_epel_pixels_tab[0][1][2] =
> ff_put_vp8_epel16_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_rvv;
> > +c->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_rvv;
> >  }
> >  #endif
> >  #endif
> > diff --git a/libavcodec/riscv/vp8dsp_rvv.S
> b/libavcodec/riscv/vp8dsp_rvv.S
> > index bf268e4d8d..baa8152830 100644
> > --- a/libavcodec/riscv/vp8dsp_rvv.S
> > +++ b/libavcodec/riscv/vp8dsp_rvv.S
> > @@ -161,26 +161,26 @@ const subpel_filters
> >  .byte 0,  -1,  12, 123,  -6, 0
> >  endconst
> >
> > -.macro epel_filter size type
> > -lla t2, subpel_filters
> > +.macro epel_filter size type regtype
> > +lla \regtype\()2, subpel_filters
> >  .ifc \type,v
> > -addit0, a6, -1
> > +addi\regtype\()0, a6, -1
>
> IMO, passing a complete register name, if you really need to vary it,
> would be
> simpler and more flexible than an ABI register type prefix.
>
> >  .elseif \type == h
> > -addit0, a5, -1
> > +addi\regtype\()0, a5, -1
> >  .endif
> > -li  t1, 6
> > -mul t0, t0, t1
> > -add t0, t0, t2
> > +li  \regtype\()1, 6
> > +mul \regtype\()0, \regtype\()0, \regtype\()1
> > +add \regtype\()0, \regtype\()0, \regtype\()2
> >  .irp n 1,2,3,4
> > -lb  t\n, \n(t0)
> > +lb  \regtype\n, \n(\regtype\()0)
> >  .endr
> >  .ifc \size,6
> > -lb  t5, 5(t0)
> > -lb  t0, (t0)
> > +lb  \regtype\()5, 5(\regtype\()0)
> > +lb  \regtype\()0, (\regtype\()0)
> >  .endif
> >  .endm
> >
> > -.macro epel_load dst len size type
> > +.macro epel_load dst len size type from_mem regtype
> >  .ifc \type,v
> >  mv  a5, a3
> >  .else
> > @@

Re: [FFmpeg-devel] [PATCH v2 1/9] lavc/vp9dsp: R-V ipred vert

2024-05-07 Thread flow gg
Fixed issues similar to vp8

 于2024年5月7日周二 15:36写道:

> From: sunyuechi 
>
> C908:
> vp9_vert_8x8_8bpp_c: 22.0
> vp9_vert_8x8_8bpp_rvi: 15.7
> vp9_vert_16x16_8bpp_c: 71.2
> vp9_vert_16x16_8bpp_rvi: 39.0
> vp9_vert_32x32_8bpp_c: 300.2
> vp9_vert_32x32_8bpp_rvi: 135.2
> ---
>  libavcodec/riscv/Makefile|  1 +
>  libavcodec/riscv/vp9_intra_rvi.S | 61 
>  libavcodec/riscv/vp9dsp.h|  6 
>  libavcodec/riscv/vp9dsp_init.c   | 15 ++--
>  4 files changed, 80 insertions(+), 3 deletions(-)
>  create mode 100644 libavcodec/riscv/vp9_intra_rvi.S
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 050c08ee61..65dd0d656a 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -63,6 +63,7 @@ RVV-OBJS-$(CONFIG_VC1DSP) += riscv/vc1dsp_rvv.o
>  OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_init.o
>  RVV-OBJS-$(CONFIG_VP8DSP) += riscv/vp8dsp_rvv.o
>  OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9dsp_init.o
> +RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> diff --git a/libavcodec/riscv/vp9_intra_rvi.S
> b/libavcodec/riscv/vp9_intra_rvi.S
> new file mode 100644
> index 00..617f9f55a2
> --- /dev/null
> +++ b/libavcodec/riscv/vp9_intra_rvi.S
> @@ -0,0 +1,61 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences
> (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +#if __riscv_xlen >= 64
> +func ff_v_32x32_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +ld   t2, 16(a3)
> +ld   t3, 24(a3)
> +.rept 32
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +sd   t2, 16(a0)
> +sd   t3, 24(a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_16x16_rvi
> +ld   t0, (a3)
> +ld   t1, 8(a3)
> +.rept 16
> +sd   t0, (a0)
> +sd   t1, 8(a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +
> +func ff_v_8x8_rvi
> +ld   t0, (a3)
> +.rept 8
> +sd   t0, (a0)
> +add  a0, a0, a1
> +.endr
> +
> +ret
> +endfunc
> +#endif
> diff --git a/libavcodec/riscv/vp9dsp.h b/libavcodec/riscv/vp9dsp.h
> index 25047ed507..f8bc6563a5 100644
> --- a/libavcodec/riscv/vp9dsp.h
> +++ b/libavcodec/riscv/vp9dsp.h
> @@ -60,6 +60,12 @@ void ff_dc_129_16x16_rvv(uint8_t *dst, ptrdiff_t
> stride, const uint8_t *l,
>   const uint8_t *a);
>  void ff_dc_129_8x8_rvv(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> const uint8_t *a);
> +void ff_v_32x32_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_16x16_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +const uint8_t *a);
> +void ff_v_8x8_rvi(uint8_t *dst, ptrdiff_t stride, const uint8_t *l,
> +  const uint8_t *a);
>
>  #define VP9_8TAP_RISCV_RVV_FUNC(SIZE, type, type_idx)
>  \
>  void ff_put_8tap_##type##_##SIZE##h_rvv(uint8_t *dst, ptrdiff_t
> dststride,   \
> diff --git a/libavcodec/riscv/vp9dsp_init.c
> b/libavcodec/riscv/vp9dsp_init.c
> index 69ab39004c..d249dd71b2 100644
> --- a/libavcodec/riscv/vp9dsp_init.c
> +++ b/libavcodec/riscv/vp9dsp_init.c
> @@ -24,11 +24,19 @@
>  #include "libavcodec/vp9dsp.h"
>  #include "vp9dsp.h"
>
> -static av_cold void vp9dsp_intrapred_init_rvv(VP9DSPContext *dsp, int bpp)
> +static av_cold void vp9dsp_intrapred_init_riscv(VP9DSPContext *dsp, int
> bpp)
>  {
> -#if HAVE_RVV
> +#if HAVE_RV
>  int flags = av_get_cpu_flags();
>
> +if (bpp == 8 && flags & AV_CPU_FLAG_RVI) {
> +# if __riscv_xlen >= 64
> +dsp->intra_pred[TX_32X32][VERT_PRED] = ff_v_32x32_rvi;
> +dsp->intra_pred[TX_16X16][VERT_PRED] = ff_v_16x16_rvi;
> +dsp->intra_pred[TX

  1   2   3   4   >