On Tuesday, 21 May 2024 at 10:37:51 EEST, u...@foxmail.com wrote:

> From: sunyuechi <sunyue...@iscas.ac.cn>
> ---
>  libavcodec/riscv/Makefile      |   2 +
>  libavcodec/riscv/vvc_mc_rvv.S  | 312 +++++++++++++++++++++++++++++++++
>  libavcodec/riscv/vvcdsp_init.c |  76 ++++++++
>  libavcodec/vvc/dsp.c           |   4 +-
>  libavcodec/vvc/dsp.h           |   1 +
>  5 files changed, 394 insertions(+), 1 deletion(-)
>  create mode 100644 libavcodec/riscv/vvc_mc_rvv.S
>  create mode 100644 libavcodec/riscv/vvcdsp_init.c
>
> diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
> index 27b268ae39..6297664fc9 100644
> --- a/libavcodec/riscv/Makefile
> +++ b/libavcodec/riscv/Makefile
> @@ -68,3 +68,5 @@ RV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvi.o \
>  RVV-OBJS-$(CONFIG_VP9_DECODER) += riscv/vp9_intra_rvv.o
>  OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_init.o
>  RVV-OBJS-$(CONFIG_VORBIS_DECODER) += riscv/vorbisdsp_rvv.o
> +OBJS-$(CONFIG_VVC_DECODER) += riscv/vvcdsp_init.o
> +RVV-OBJS-$(CONFIG_VVC_DECODER) += riscv/vvc_mc_rvv.o
> diff --git a/libavcodec/riscv/vvc_mc_rvv.S b/libavcodec/riscv/vvc_mc_rvv.S
> new file mode 100644
> index 0000000000..26a6afba1f
> --- /dev/null
> +++ b/libavcodec/riscv/vvc_mc_rvv.S
> @@ -0,0 +1,312 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/riscv/asm.S"
> +
> +.macro vsetvlstatic8 w vlen is_w
> +        .if \w <= 2
> +        vsetivli zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf8, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf4, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e8, m1, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e8, mf2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e8, m1, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +        li t0, 64
> +        vsetvli zero, t0, e8, m2, ta, ma
> +        .else
> +        li t0, \w
> +        vsetvli zero, t0, e8, m4, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic16 w vlen is_w
> +        .if \w <= 2
> +        vsetivli zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e16, mf4, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e16, mf2, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e16, m2, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e16, m1, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e16, m2, ta, ma
> +        .elseif \w <= (\vlen / 4) || \is_w
> +        li t0, 64
> +        vsetvli zero, t0, e16, m4, ta, ma
> +        .else
> +        li t0, \w
> +        vsetvli zero, t0, e16, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro vsetvlstatic32 w vlen
> +        .if \w <= 2
> +        vsetivli zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 4 && \vlen == 128
> +        vsetivli zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 4 && \vlen >= 256
> +        vsetivli zero, \w, e32, mf2, ta, ma
> +        .elseif \w <= 8 && \vlen == 128
> +        vsetivli zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 8 && \vlen >= 256
> +        vsetivli zero, \w, e32, m1, ta, ma
> +        .elseif \w <= 16 && \vlen == 128
> +        vsetivli zero, \w, e32, m4, ta, ma
> +        .elseif \w <= 16 && \vlen >= 256
> +        vsetivli zero, \w, e32, m2, ta, ma
> +        .elseif \w <= 32 && \vlen >= 256
> +        li t0, \w
> +        vsetvli zero, t0, e32, m4, ta, ma
> +        .else
> +        li t0, \w
> +        vsetvli zero, t0, e32, m8, ta, ma
> +        .endif
> +.endm
> +
> +.macro avg_nx1 w vlen
> +        vsetvlstatic16 \w, \vlen, 0
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vadd.vv v8, v8, v0
> +        vmax.vx v8, v8, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v8, v8, 7
> +        vse8.v v8, (a0)
> +.endm
> +
> +.macro avg w h vlen
> +        csrw vxrm, zero
> +
> +.if \w <= (\vlen / 4) && \h >= 4
> +.rept (\h / 4)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        addi t3, a2, 128*2*2
> +        addi t4, a3, 128*2*2
> +        addi a7, a3, 128*2*3
> +        addi t6, a2, 128*2*3
> +        add t2, a0, a1
> +        sh1add t5, a1, a0
> +        add a6, t5, a1
> +        vle16.v v0, (a2)
> +        vle16.v v4, (a3)
> +        vle16.v v8, (t0)
> +        vle16.v v12, (t1)
> +        vle16.v v16, (t3)
> +        vle16.v v20, (t4)
> +        vle16.v v24, (t6)
> +        vle16.v v28, (a7)

I would expect that you can get better performance by interleaving scalar and vector stuff, and possibly also vector loads and vector arithmetic.
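For instance, something along these lines (untested, and only meant to illustrate the scheduling; it reuses the registers of the shorter two-row block further down, so the semantics stay the same):

        vsetvlstatic16 \w, \vlen, 0
        vle16.v    v0, (a2)
        addi       t0, a2, 128*2      // scalar address math between the loads
        vle16.v    v8, (a3)
        addi       t1, a3, 128*2
        vle16.v    v16, (t0)
        vadd.vv    v8, v8, v0         // start the adds before the last loads
        add        t2, a0, a1
        vle16.v    v24, (t1)
        addi       a2, a2, 128*4      // pointer bumps moved up from the tail
        vmax.vx    v8, v8, zero
        addi       a3, a3, 128*4
        vadd.vv    v24, v24, v16
        vmax.vx    v24, v24, zero
        vsetvlstatic8 \w, \vlen, 0
        vnclipu.wi v8, v8, 7
        vnclipu.wi v24, v24, 7
        vse8.v     v8, (a0)
        vse8.v     v24, (t2)
        sh1add     a0, a1, a0

The point is that the address arithmetic and pointer bumps can issue alongside the loads, and the first vadd.vv no longer has to wait for the whole load batch on an in-order core.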
> +        vadd.vv v4, v4, v0
> +        vadd.vv v12, v12, v8
> +        vadd.vv v20, v20, v16
> +        vadd.vv v28, v28, v24
> +        vmax.vx v4, v4, zero
> +        vmax.vx v12, v12, zero
> +        vmax.vx v20, v20, zero
> +        vmax.vx v28, v28, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v4, v4, 7
> +        vnclipu.wi v12, v12, 7
> +        vnclipu.wi v20, v20, 7
> +        vnclipu.wi v28, v28, 7
> +        vse8.v v4, (a0)
> +        vse8.v v12, (t2)
> +        vse8.v v20, (t5)
> +        vse8.v v28, (a6)
> +        addi a2, a2, 128*8
> +        addi a3, a3, 128*8
> +        sh2add a0, a1, a0
> +.endr
> +
> +.elseif (\w <= (\vlen / 4) && \h == 2) || (\w == (\vlen / 2))
> +.rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 0
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        add t2, a0, a1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vle16.v v16, (t0)
> +        vle16.v v24, (t1)
> +        vadd.vv v8, v8, v0
> +        vadd.vv v24, v24, v16
> +        vmax.vx v8, v8, zero
> +        vmax.vx v24, v24, zero
> +        vsetvlstatic8 \w, \vlen, 0
> +        vnclipu.wi v8, v8, 7
> +        vnclipu.wi v24, v24, 7
> +        vse8.v v8, (a0)
> +        vse8.v v24, (t2)
> +        addi a2, a2, 128*4
> +        addi a3, a3, 128*4
> +        sh1add a0, a1, a0
> +.endr
> +
> +.else
> +.rept \h
> +        avg_nx1 \w, \vlen
> +        .if \w == 128 && \vlen == 128
> +        addi a2, a2, 64*2
> +        addi a3, a3, 64*2
> +        addi a0, a0, 64
> +        avg_nx1 \w, \vlen
> +        addi a2, a2, -64*2
> +        addi a3, a3, -64*2
> +        addi a0, a0, -64
> +        .endif
> +        addi a2, a2, 128*2
> +        addi a3, a3, 128*2
> +        add a0, a0, a1
> +.endr
> +.endif
> +.endm
> +
> +.macro w_avg_nx1 w vlen
> +        vsetvlstatic16 \w, \vlen, 1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vwmul.vx v16, v0, a7
> +        vwmacc.vx v16, t3, v8
> +        vsetvlstatic32 \w, \vlen
> +        vadd.vx v16, v16, t4
> +        vsetvlstatic16 \w, \vlen, 1
> +        vnsrl.wx v16, v16, t6
> +        vmax.vx v16, v16, zero
> +        vsetvlstatic8 \w, \vlen, 1
> +        vnclipu.wi v16, v16, 0
> +        vse8.v v16, (a0)
> +.endm
> +
> +#if (__riscv_xlen == 64)
> +.macro w_avg w h vlen
> +        csrw vxrm, zero
> +        addi t6, a6, 7
> +        ld t3, (sp)
> +        ld t4, 8(sp)
> +        ld t5, 16(sp)
> +        add t4, t4, t5
> +        addi t4, t4, 1 // o0 + o1 + 1
> +        addi t5, t6, -1 // shift - 1
> +        sll t4, t4, t5
> +
> +.if \w <= (\vlen / 8)
> +        .rept (\h / 2)
> +        vsetvlstatic16 \w, \vlen, 1
> +        addi t0, a2, 128*2
> +        addi t1, a3, 128*2
> +        add t2, a0, a1
> +        vle16.v v0, (a2)
> +        vle16.v v8, (a3)
> +        vle16.v v20, (t0)
> +        vle16.v v24, (t1)
> +        vwmul.vx v16, v0, a7
> +        vwmul.vx v28, v20, a7
> +        vwmacc.vx v16, t3, v8
> +        vwmacc.vx v28, t3, v24
> +        vsetvlstatic32 \w, \vlen
> +        vadd.vx v16, v16, t4
> +        vadd.vx v28, v28, t4
> +        vsetvlstatic16 \w, \vlen, 1
> +        vnsrl.wx v16, v16, t6
> +        vnsrl.wx v28, v28, t6
> +        vmax.vx v16, v16, zero
> +        vmax.vx v28, v28, zero
> +        vsetvlstatic8 \w, \vlen, 1
> +        vnclipu.wi v16, v16, 0
> +        vnclipu.wi v28, v28, 0
> +        vse8.v v16, (a0)
> +        vse8.v v28, (t2)
> +        addi a2, a2, 128*4
> +        addi a3, a3, 128*4
> +        sh1add a0, a1, a0
> +        .endr
> +.else
> +        .rept \h
> +        w_avg_nx1 \w, \vlen
> +        .if \w == (\vlen / 2)
> +        addi a2, a2, (\vlen / 2)
> +        addi a3, a3, (\vlen / 2)
> +        addi a0, a0, (\vlen / 4)
> +        w_avg_nx1 \w, \vlen
> +        addi a2, a2, -(\vlen / 2)
> +        addi a3, a3, -(\vlen / 2)
> +        addi a0, a0, -(\vlen / 4)
> +        .elseif \w == 128 && \vlen == 128
> +        .rept 3
> +        addi a2, a2, (\vlen / 2)
> +        addi a3, a3, (\vlen / 2)
> +        addi a0, a0, (\vlen / 4)
> +        w_avg_nx1 \w, \vlen
> +        .endr
> +        addi a2, a2, -(\vlen / 2) * 3
> +        addi a3, a3, -(\vlen / 2) * 3
> +        addi a0, a0, -(\vlen / 4) * 3
> +        .endif
> +
> +        addi a2, a2, 128*2
> +        addi a3, a3, 128*2
> +        add a0, a0, a1
> +        .endr
> +.endif
> +.endm
> +#endif
> +
> +.macro func_avg name vlen
> +func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
> +.irp w,2,4,8,16,32,64,128
> +        li t3, \w
> +        bne a4, t3, \name\vlen\()end\w
> +.irp h,2,4,8,16,32,64,128
> +        li t4, \h
> +        bne a5, t4, \name\vlen\()end\w\h
> +        \name \w \h \vlen
> +        ret
> +\name\vlen\()end\w\h:
> +.endr
> +\name\vlen\()end\w:

These labels lead nowhere? If you actually mean to implicitly fall through to the next function, you can use the function name directly rather than adding odd labels.
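To illustrate what I mean (an untested sketch; the 1:/2: local labels and the explicit \fallback macro argument are mine, not in the patch):

.macro func_avg name vlen fallback
func ff_vvc_\name\()_8_rvv_\vlen\(), zve32x
.irp w,2,4,8,16,32,64,128
        li      t3, \w
        bne     a4, t3, 2f              // try the next width
.irp h,2,4,8,16,32,64,128
        li      t4, \h
        bne     a5, t4, 1f              // try the next height
        \name   \w \h \vlen
        ret
1:
.endr
2:
.endr
        j       \fallback               // no (w, h) matched: hand off explicitly
endfunc
.endm

The 256-bit instantiation would then spell the hand-off out, e.g. "func_avg avg 256 ff_vvc_avg_8_rvv_128", rather than relying on execution running past endfunc; what the last variant should do with an unsupported size is a separate question.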
> +.endr
> +endfunc
> +.endm
> +
> +func_avg avg 256
> +func_avg avg 128
> +#if (__riscv_xlen == 64)
> +func_avg w_avg 256
> +func_avg w_avg 128
> +#endif
> diff --git a/libavcodec/riscv/vvcdsp_init.c b/libavcodec/riscv/vvcdsp_init.c
> new file mode 100644
> index 0000000000..d26b4c1c4a
> --- /dev/null
> +++ b/libavcodec/riscv/vvcdsp_init.c
> @@ -0,0 +1,76 @@
> +/*
> + * Copyright (c) 2024 Institue of Software Chinese Academy of Sciences (ISCAS).
> + *
> + * This file is part of FFmpeg.
> + *
> + * FFmpeg is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * FFmpeg is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with FFmpeg; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "config.h"
> +
> +#include "libavutil/attributes.h"
> +#include "libavutil/cpu.h"
> +#include "libavutil/riscv/cpu.h"
> +#include "libavcodec/vvc/dsp.h"
> +
> +#define bf(fn, bd, opt) fn##_##bd##_##opt
> +
> +#define AVG_PROTOTYPES(bd, opt)                                             \
> +void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,            \
> +    const int16_t *src0, const int16_t *src1, int width, int height);       \
> +void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,          \
> +    const int16_t *src0, const int16_t *src1, int width, int height,        \
> +    int denom, int w0, int w1, int o0, int o1);
> +
> +AVG_PROTOTYPES(8, rvv_128)
> +AVG_PROTOTYPES(8, rvv_256)
> +
> +#define AVG_INIT(bd, opt) do {                                              \
> +    c->inter.avg = bf(ff_vvc_avg, bd, opt);                                 \
> +    c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);                             \
> +} while (0)
> +
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *const c, const int bd)
> +{
> +#if HAVE_RVV
> +    const int flags = av_get_cpu_flags();
> +
> +    if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +        ff_rv_vlen_least(256)) {
> +        switch (bd) {
> +        case 8:
> +            c->inter.avg = ff_vvc_avg_8_rvv_256;
> +# if (__riscv_xlen == 64)
> +            c->inter.w_avg = ff_vvc_w_avg_8_rvv_256;
> +# endif
> +            break;
> +        default:
> +            break;
> +        }
> +    } else if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB_ADDR) &&
> +               ff_rv_vlen_least(128)) {
> +        switch (bd) {
> +        case 8:
> +            c->inter.avg = ff_vvc_avg_8_rvv_128;
> +# if (__riscv_xlen == 64)
> +            c->inter.w_avg = ff_vvc_w_avg_8_rvv_128;
> +# endif
> +            break;
> +        default:
> +            break;
> +        }
> +    }
> +#endif
> +}
> diff --git a/libavcodec/vvc/dsp.c b/libavcodec/vvc/dsp.c
> index 41e830a98a..c55a37d255 100644
> --- a/libavcodec/vvc/dsp.c
> +++ b/libavcodec/vvc/dsp.c
> @@ -121,7 +121,9 @@ void ff_vvc_dsp_init(VVCDSPContext *vvcdsp, int bit_depth)
>          break;
>      }
>  
> -#if ARCH_X86
> +#if ARCH_RISCV
> +    ff_vvc_dsp_init_riscv(vvcdsp, bit_depth);
> +#elif ARCH_X86
>      ff_vvc_dsp_init_x86(vvcdsp, bit_depth);
>  #endif
>  }
> diff --git a/libavcodec/vvc/dsp.h b/libavcodec/vvc/dsp.h
> index 9810ac314c..dcb978549f 100644
> --- a/libavcodec/vvc/dsp.h
> +++ b/libavcodec/vvc/dsp.h
> @@ -167,6 +167,7 @@ typedef struct VVCDSPContext {
>  
>  void ff_vvc_dsp_init(VVCDSPContext *hpc, int bit_depth);
>  
> +void ff_vvc_dsp_init_riscv(VVCDSPContext *hpc, const int bit_depth);
>  void ff_vvc_dsp_init_x86(VVCDSPContext *hpc, const int bit_depth);
>  
>  #endif /* AVCODEC_VVC_DSP_H */

-- 
Rémi Denis-Courmont
http://www.remlab.net/