From: daichengrong <daichengr...@iscas.ac.cn>
Subject: riscv/hevcdsp_idct_rvv: Optimize idct_32x32_8
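The transform is computed as two passes of 1-D 32-point transforms, four columns
(first pass) or four rows (second pass) per iteration. Each 32-point transform is
split into a 16-point even part (func_tr_16x4_noscale) and an odd part accumulated
from the factors in the trans table (tr_block1..tr_block4); the two halves are then
recombined with butterflies and narrowed back to 16 bits. A rough scalar sketch of
that recombination step, for illustration only (the helper name is made up, and the
rounding is shown as a plain add-half rather than the vxrm mode used by vnclip):

    #include <stdint.h>

    /* dst: 32 outputs of one 1-D pass; even/odd: the 16 partial sums of the
     * even and odd halves; shift: 7 for the first pass, 20 - bitdepth for
     * the second pass (12 for 8-bit). */
    static void idct32_recombine_sketch(int16_t *dst, const int32_t *even,
                                        const int32_t *odd, int shift)
    {
        const int32_t add = 1 << (shift - 1);

        for (int k = 0; k < 16; k++) {
            int32_t a = (even[k] + odd[k] + add) >> shift;
            int32_t b = (even[k] - odd[k] + add) >> shift;

            /* clip to int16_t, as vnclip does in the vector code */
            dst[k]      = a < -32768 ? -32768 : a > 32767 ? 32767 : a;
            dst[31 - k] = b < -32768 ? -32768 : b > 32767 ? 32767 : b;
        }
    }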
On Banana PI F3:

hevc_idct_32x32_8_c:       119579.3 ( 1.00x)
hevc_idct_32x32_8_rvv_i64:  51254.4 ( 2.33x)

Signed-off-by: daichengrong <daichengr...@iscas.ac.cn>
---
 libavcodec/riscv/Makefile           |    1 +
 libavcodec/riscv/hevcdsp_idct_rvv.S | 1042 +++++++++++++++++++++++++++
 libavcodec/riscv/hevcdsp_init.c     |   52 +-
 3 files changed, 1075 insertions(+), 20 deletions(-)
 create mode 100644 libavcodec/riscv/hevcdsp_idct_rvv.S

diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile
index a80d2fa2e7..dfc33afbee 100644
--- a/libavcodec/riscv/Makefile
+++ b/libavcodec/riscv/Makefile
@@ -36,6 +36,7 @@ RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \
 OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o
 RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o
 OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o
+OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_idct_rvv.o
 RVV-OBJS-$(CONFIG_HEVC_DECODER) += riscv/h26x/h2656_inter_rvv.o
 OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_init.o
 RVV-OBJS-$(CONFIG_HUFFYUV_DECODER) += riscv/huffyuvdsp_rvv.o
diff --git a/libavcodec/riscv/hevcdsp_idct_rvv.S b/libavcodec/riscv/hevcdsp_idct_rvv.S
new file mode 100644
index 0000000000..f8dd2e5bf4
--- /dev/null
+++ b/libavcodec/riscv/hevcdsp_idct_rvv.S
@@ -0,0 +1,1042 @@
+/*
+ * Copyright (c) 2025 Institute of Software, Chinese Academy of Sciences (ISCAS).
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/riscv/asm.S" + +const trans, align=4 + .2byte 64, 83, 64, 36 + .2byte 89, 75, 50, 18 + .2byte 90, 87, 80, 70 + .2byte 57, 43, 25, 9 + .2byte 90, 90, 88, 85 + .2byte 82, 78, 73, 67 + .2byte 61, 54, 46, 38 + .2byte 31, 22, 13, 4 +endconst + +.macro sum_sub out, in, c, op, p + vsetivli t0, 4, e16, mf2, tu, ma + .ifc \op, + + .ifc \p, 2 + vslidedown.vi v8, \in, 4 + vwmacc.vx \out, \c, v8 + .else + vwmacc.vx \out, \c, \in + .endif + .else + .ifc \p, 2 + neg \c, \c + vslidedown.vi v8, \in, 4 + vwmacc.vx \out, \c, v8 + neg \c, \c + .else + neg \c, \c + vwmacc.vx \out, \c, \in + neg \c, \c + .endif + .endif +.endm + +.macro add_member32 in, t0, index0, t1, index1, t2, index2, t3, index3, op0, op1, op2, op3, p + vsetivli t0, 1, e16, m1, tu, ma + vslidedown.vi v12, \t0, \index0 + vmv.x.s s2, v12 + vslidedown.vi v12, \t1, \index1 + vmv.x.s s3, v12 + vslidedown.vi v12, \t2, \index2 + vmv.x.s s4, v12 + vslidedown.vi v12, \t3, \index3 + vmv.x.s s5, v12 + + sum_sub v24, \in, s2, \op0, \p + sum_sub v25, \in, s3, \op1, \p + sum_sub v26, \in, s4, \op2, \p + sum_sub v27, \in, s5, \op3, \p +.endm + +.macro butterfly e, o, tmp_p, tmp_m + vsetivli t0, 4, e32, m1, tu, ma + vadd.vv \tmp_p, \e, \o + vsub.vv \tmp_m, \e, \o +.endm + +.macro butterfly16 in0, in1, in2, in3, in4, in5, in6, in7 + vsetivli t0, 4, e32, m1, tu, ma + vadd.vv v20, \in0, \in1 + vsub.vv \in0, \in0, \in1 + vadd.vv \in1, \in2, \in3 + vsub.vv \in2, \in2, \in3 + vadd.vv \in3, \in4, \in5 + vsub.vv \in4, \in4, \in5 + vadd.vv \in5, \in6, \in7 + vsub.vv \in6, \in6, \in7 +.endm + +.macro multiply in + vsetivli t0, 1, e16, m1, tu, ma + vmv.x.s s2, \in + vslidedown.vi v12, \in, 1 + vmv.x.s s3, v12 + vslidedown.vi v12, \in, 2 + vmv.x.s s4, v12 + vslidedown.vi v12, \in, 3 + vmv.x.s s5, v12 + + vsetivli t0, 4, e16, mf2, tu, ma + vwmul.vx v24, v4, s2 + vwmul.vx v25, v4, s3 + vwmul.vx v26, v4, s4 + vwmul.vx v27, v4, s5 +.endm + +func tr_block1, zve64x + multiply v0 + add_member32 v4, v0, 1, v1, 0, v1, 3, v2, 2, +, +, +, +, 2 + add_member32 v5, v0, 2, v1, 3, v3, 0, v3, 2, +, +, +, - + add_member32 v5, v0, 3, v2, 2, v3, 2, v1, 3, +, +, -, -, 2 + add_member32 v6, v1, 0, v3, 1, v2, 1, v0, 0, +, +, -, - + add_member32 v6, v1, 1, v3, 3, v1, 0, v1, 2, +, -, -, -, 2 + add_member32 v7, v1, 2, v3, 0, v0, 0, v3, 1, +, -, -, - + add_member32 v7, v1, 3, v2, 1, v1, 1, v2, 3, +, -, -, +, 2 + add_member32 v16, v2, 0, v1, 2, v2, 2, v1, 0, +, -, -, + + add_member32 v16, v2, 1, v0, 3, v3, 3, v0, 2, +, -, -, +, 2 + add_member32 v17, v2, 2, v0, 1, v2, 3, v2, 1, +, -, +, + + add_member32 v17, v2, 3, v0, 2, v1, 2, v3, 3, +, -, +, -, 2 + add_member32 v18, v3, 0, v1, 1, v0, 1, v2, 0, +, -, +, - + add_member32 v18, v3, 1, v2, 0, v0, 3, v0, 1, +, -, +, -, 2 + add_member32 v19, v3, 2, v2, 3, v2, 0, v1, 1, +, -, +, - + add_member32 v19, v3, 3, v3, 2, v3, 1, v3, 0, +, -, +, -, 2 + ret +endfunc + +func tr_block2, zve64x + multiply v1 + add_member32 v4, v3, 1, v3, 3, v3, 0, v2, 1, +, -, -, -, 2 + add_member32 v5, v2, 1, v1, 0, v0, 0, v1, 1, -, -, -, - + add_member32 v5, v0, 0, v1, 2, v3, 1, v2, 3, -, -, -, +, 2 + add_member32 v6, v2, 0, v3, 2, v1, 1, v0, 3, -, +, +, + + add_member32 v6, v3, 2, v0, 3, v1, 3, v3, 1, +, +, +, -, 2 + add_member32 v7, v1, 1, v1, 3, v2, 3, v0, 0, +, +, -, - + add_member32 v7, v0, 3, v3, 1, v0, 1, v3, 3, +, -, -, +, 2 + 
add_member32 v16, v3, 0, v0, 2, v3, 2, v0, 1, +, -, -, + + add_member32 v16, v2, 2, v2, 0, v1, 0, v3, 2, -, -, +, +, 2 + add_member32 v17, v0, 1, v3, 0, v2, 0, v0, 2, -, +, +, - + add_member32 v17, v1, 3, v0, 1, v2, 2, v3, 0, -, +, -, -, 2 + add_member32 v18, v3, 3, v2, 1, v0, 2, v1, 0, +, +, -, + + add_member32 v18, v1, 2, v2, 3, v3, 3, v2, 2, +, -, -, +, 2 + add_member32 v19, v0, 2, v0, 1, v0, 3, v1, 2, +, -, +, - + add_member32 v19, v2, 3, v2, 2, v2, 1, v2, 0, +, -, +, -, 2 + ret +endfunc + +func tr_block3, zve64x + multiply v2 + add_member32 v4, v1, 2, v0, 3, v0, 0, v0, 2, -, -, -, -, 2 + add_member32 v5, v2, 2, v3, 3, v2, 3, v1, 2, -, -, +, + + add_member32 v5, v1, 0, v0, 2, v2, 1, v3, 3, +, +, +, -, 2 + add_member32 v6, v3, 0, v2, 2, v0, 1, v1, 3, +, -, -, - + add_member32 v6, v0, 2, v2, 0, v3, 0, v0, 0, -, -, +, +, 2 + add_member32 v7, v3, 2, v1, 0, v2, 0, v2, 2, -, +, +, - + add_member32 v7, v0, 0, v3, 2, v0, 2, v3, 0, +, +, -, -, 2 + add_member32 v16, v3, 3, v0, 1, v3, 1, v0, 3, -, -, +, + + add_member32 v16, v0, 1, v2, 3, v1, 3, v1, 1, -, +, +, -, 2 + add_member32 v17, v3, 1, v1, 3, v0, 3, v3, 2, +, +, -, + + add_member32 v17, v0, 3, v1, 1, v3, 2, v2, 0, +, -, +, +, 2 + add_member32 v18, v2, 3, v3, 1, v1, 2, v0, 1, -, -, +, - + add_member32 v18, v1, 1, v0, 0, v1, 0, v2, 1, -, +, -, +, 2 + add_member32 v19, v2, 1, v3, 0, v3, 3, v3, 1, +, -, +, + + add_member32 v19, v1, 3, v1, 2, v1, 1, v1, 0, +, -, +, -, 2 + ret +endfunc + +func tr_block4, zve64x + multiply v3 + add_member32 v4, v1, 1, v2, 0, v2, 3, v3, 2, -, -, -, -, 2 + add_member32 v5, v0, 0, v0, 3, v2, 0, v3, 1, +, +, +, + + add_member32 v5, v2, 0, v0, 0, v1, 1, v3, 0, -, -, -, -, 2 + add_member32 v6, v3, 3, v1, 2, v0, 2, v2, 3, +, +, +, + + add_member32 v6, v2, 1, v2, 3, v0, 0, v2, 2, +, -, -, -, 2 + add_member32 v7, v0, 2, v3, 3, v0, 3, v2, 1, -, -, +, + + add_member32 v7, v1, 0, v2, 2, v1, 2, v2, 0, +, +, -, -, 2 + add_member32 v16, v2, 3, v1, 1, v2, 1, v1, 3, -, -, +, + + add_member32 v16, v3, 1, v0, 1, v3, 0, v1, 2, -, +, -, -, 2 + add_member32 v17, v1, 2, v1, 0, v3, 3, v1, 1, +, -, +, + + add_member32 v17, v0, 1, v2, 1, v3, 1, v1, 0, -, +, +, -, 2 + add_member32 v18, v1, 3, v3, 2, v2, 2, v0, 3, +, -, -, + + add_member32 v18, v3, 2, v3, 0, v1, 3, v0, 2, -, -, +, -, 2 + add_member32 v19, v2, 2, v1, 3, v1, 0, v0, 1, -, +, -, + + add_member32 v19, v0, 3, v0, 2, v0, 1, v0, 0, +, -, +, -, 2 + ret +endfunc + +.macro butterfly32 in0, in1, in2, in3, out + vsetivli t0, 4, e32, m1, tu, ma + vadd.vv \out, \in0, \in1 + vsub.vv \in0, \in0, \in1 + vadd.vv \in1, \in2, \in3 + vsub.vv \in2, \in2, \in3 +.endm + +.macro load16 in0, in1, in2, in3 + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \in0, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v \in0, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \in1, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v \in1, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \in2, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v \in2, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \in3, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v \in3, (a1) + add a1, a1, a2 + add a3, a3, a2 +.endm + +.macro store16 in0, in1, in2, in3, rx + vsetivli t0, 1, e64, m1, tu, 
ma + vse64.v \in0, (a1) + vsetivli t0, 2, e64, m1, tu, ma + vslide1down.vx v8, \in0, zero + vsetivli t0, 1, e64, m1, tu, ma + vse64.v v8, (a3) + add a1, a1, a2 + add a3, a3, \rx + + vsetivli t0, 1, e64, m1, tu, ma + vse64.v \in1, (a1) + vsetivli t0, 2, e64, m1, tu, ma + vslide1down.vx v8, \in1, zero + vsetivli t0, 1, e64, m1, tu, ma + vse64.v v8, (a3) + add a1, a1, a2 + add a3, a3, \rx + + vsetivli t0, 1, e64, m1, tu, ma + vse64.v \in2, (a1) + vsetivli t0, 2, e64, m1, tu, ma + vslide1down.vx v8, \in2, zero + vsetivli t0, 1, e64, m1, tu, ma + vse64.v v8, (a3) + add a1, a1, a2 + add a3, a3, \rx + + vsetivli t0, 1, e64, m1, tu, ma + vse64.v \in3, (a1) + vsetivli t0, 2, e64, m1, tu, ma + vslide1down.vx v8, \in3, zero + vsetivli t0, 1, e64, m1, tu, ma + vse64.v v8, (a3) + add a1, a1, a2 + add a3, a3, \rx + +.endm + +.macro load32 + addi a1, a5, 64 + addi a3, a1, 128 + li a2, 256 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v4, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v4, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v5, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v5, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v6, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v6, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v7, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v7, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v16, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v16, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v17, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v17, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v18, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v18, (a1) + add a1, a1, a2 + add a3, a3, a2 + + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v8, (a3) + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx v19, v8, zero + vsetivli t0, 1, e64, m1, tu, ma + vle64.v v19, (a1) + add a1, a1, a2 + add a3, a3, a2 + +.endm + + + +.macro add_member in, tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7, op0, op1, op2, op3, op4, op5, op6, op7, p + sum_sub v21, \in, \tt0, \op0, \p + sum_sub v22, \in, \tt1, \op1, \p + sum_sub v23, \in, \tt2, \op2, \p + sum_sub v24, \in, \tt3, \op3, \p + sum_sub v25, \in, \tt4, \op4, \p + sum_sub v26, \in, \tt5, \op5, \p + sum_sub v27, \in, \tt6, \op6, \p + sum_sub v28, \in, \tt7, \op7, \p +.endm + +.macro scale_store shift + vsetivli t0, 8, e16, m1, tu, ma + vle16.v v28, (a4) + addi a4, a4, 2*8 + vle16.v v29, (a4) + addi a4, a4, 2*8 + vle16.v v30, (a4) + addi a4, a4, 2*8 + vle16.v v31, (a4) + addi a4, a4, 2*8 + + butterfly32 v28, v24, v29, v25, v2 + butterfly32 v30, v26, v31, v27, v3 + + scale v20, v21, v22, v23, v2, v28, v24, v29, v3, v30, v26, v31, \shift + + transpose16_4x4_2 v20, v21, v22, v23, v24, v25, v26, v27, v28, v29 + + store16 v20, v21, v22, v23, t1 + + vsetivli t0, 4, e16, m1, tu, ma + vle16.v v2, (t2) + addi t2, t2, 8 + vle16.v v3, (t2) + addi t2, t2, -8 +.endm + +.macro store_to_stack 
off1, off2, in0, in2, in4, in6, in7, in5, in3, in1 + li a7, \off1 + add a1, sp, a7 + li a7, \off2 + add a3, sp, a7 + li a2, -16 + li a4, 16 + + vsetivli t0, 4, e32, m1, tu, ma + vse32.v \in0, (a1) + add a1, a1, a4 + vse32.v \in1, (a3) + add a3, a3, a2 + vse32.v \in2, (a1) + add a1, a1, a4 + vse32.v \in3, (a3) + add a3, a3, a2 + vse32.v \in4, (a1) + add a1, a1, a4 + vse32.v \in5, (a3) + add a3, a3, a2 + vse32.v \in6, (a1) + vse32.v \in7, (a3) +.endm + +.macro transpose16_4x4_2 r0, r1, r2, r3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5 + // lower halves + vsetivli t0, 1, e16, m1, tu, ma + vmv.v.v \tmp0\(), \r0\() + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp0\(), \r1\(), 1 + vslidedown.vi v8, \r0\(), 2 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 2 + vslidedown.vi v8, \r1\(), 2 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 3 + vsetivli t0, 1, e16, m1, tu, ma + vslidedown.vi v8, \r0\(), 1 + vmv.v.v \tmp1\(), v8 + vslidedown.vi v8, \r1\(), 1 + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 1 + vslidedown.vi v8, \r0\(), 3 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 2 + vslidedown.vi v8, \r1\(), 3 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 3 + vsetivli t0, 1, e16, m1, tu, ma + vmv.v.v \tmp2\(), \r2\() + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp2\(), \r3\(), 1 + vslidedown.vi v8, \r2\(), 2 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 2 + vslidedown.vi v8, \r3\(), 2 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 3 + vsetivli t0, 1, e16, m1, tu, ma + vslidedown.vi v8, \r2\(), 1 + vmv.v.v \tmp3\(), v8 + vslidedown.vi v8, \r3\(), 1 + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 1 + vslidedown.vi v8, \r2\(), 3 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 2 + vslidedown.vi v8, \r3\(), 3 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 3 + vsetivli t0, 1, e32, m1, tu, ma + vmv.v.v \tmp4\(), \tmp0\() + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp4\(), \tmp2\(), 1 + vsetivli t0, 1, e32, m1, tu, ma + vslidedown.vi v8, \tmp0\(), 1 + vmv.v.v \tmp5\(), v8 + vslidedown.vi v8, \tmp2\(), 1 + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp5\(), v8, 1 + vsetivli t0, 1, e32, m1, tu, ma + vmv.v.v \tmp0\(), \tmp1\() + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp0\(), \tmp3\(), 1 + vsetivli t0, 1, e32, m1, tu, ma + vslidedown.vi v8, \tmp1\(), 1 + vmv.v.v \tmp2\(), v8 + vslidedown.vi v8, \tmp3\(), 1 + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp2\(), v8, 1 + + vsetivli t0, 1, e64, m1, tu, ma + vmv.v.v \r0\(), \tmp4\() + vmv.v.v \r2\(), \tmp5\() + vmv.v.v \r1\(), \tmp0\() + vmv.v.v \r3\(), \tmp2\() + + vsetivli t0, 1, e16, m1, tu, ma + vmv.v.v \tmp0\(), \r3\() + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp0\(), \r2\(), 1 + vslidedown.vi v8, \r3\(), 2 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 2 + vslidedown.vi v8, \r2\(), 2 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 3 + vslidedown.vi v8, \r3\(), 4 + vsetivli t0, 5, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 4 + vslidedown.vi v8, \r2\(), 4 + vsetivli t0, 6, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 5 + vslidedown.vi v8, \r3\(), 6 + vsetivli t0, 7, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 6 + vslidedown.vi v8, \r2\(), 6 + vsetivli t0, 8, e16, m1, tu, ma + vslideup.vi \tmp0\(), v8, 7 + vsetivli t0, 1, e16, m1, tu, ma + vslidedown.vi v8, \r3\(), 1 + vmv.v.v \tmp1\(), v8 + vslidedown.vi v8, \r2\(), 1 + vsetivli t0, 2, e16, m1, tu, ma + 
vslideup.vi \tmp1\(), v8, 1 + vslidedown.vi v8, \r3\(), 3 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 2 + vslidedown.vi v8, \r2\(), 3 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 3 + vslidedown.vi v8, \r3\(), 5 + vsetivli t0, 5, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 4 + vslidedown.vi v8, \r2\(), 5 + vsetivli t0, 6, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 5 + vslidedown.vi v8, \r3\(), 7 + vsetivli t0, 7, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 6 + vslidedown.vi v8, \r2\(), 7 + vsetivli t0, 8, e16, m1, tu, ma + vslideup.vi \tmp1\(), v8, 7 + vsetivli t0, 1, e16, m1, tu, ma + vmv.v.v \tmp2\(), \r1\() + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp2\(), \r0\(), 1 + vslidedown.vi v8, \r1\(), 2 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 2 + vslidedown.vi v8, \r0\(), 2 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 3 + vslidedown.vi v8, \r1\(), 4 + vsetivli t0, 5, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 4 + vslidedown.vi v8, \r0\(), 4 + vsetivli t0, 6, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 5 + vslidedown.vi v8, \r1\(), 6 + vsetivli t0, 7, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 6 + vslidedown.vi v8, \r0\(), 6 + vsetivli t0, 8, e16, m1, tu, ma + vslideup.vi \tmp2\(), v8, 7 + vsetivli t0, 1, e16, m1, tu, ma + vslidedown.vi v8, \r1\(), 1 + vmv.v.v \tmp3\(), v8 + vslidedown.vi v8, \r0\(), 1 + vsetivli t0, 2, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 1 + vslidedown.vi v8, \r1\(), 3 + vsetivli t0, 3, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 2 + vslidedown.vi v8, \r0\(), 3 + vsetivli t0, 4, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 3 + vslidedown.vi v8, \r1\(), 5 + vsetivli t0, 5, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 4 + vslidedown.vi v8, \r0\(), 5 + vsetivli t0, 6, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 5 + vslidedown.vi v8, \r1\(), 7 + vsetivli t0, 7, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 6 + vslidedown.vi v8, \r0\(), 7 + vsetivli t0, 8, e16, m1, tu, ma + vslideup.vi \tmp3\(), v8, 7 + vsetivli t0, 1, e32, m1, tu, ma + vmv.v.v \tmp4\(), \tmp0\() + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp4\(), \tmp2\(), 1 + vslidedown.vi v8, \tmp0\(), 2 + vsetivli t0, 3, e32, m1, tu, ma + vslideup.vi \tmp4\(), v8, 2 + vslidedown.vi v8, \tmp2\(), 2 + vsetivli t0, 4, e32, m1, tu, ma + vslideup.vi \tmp4\(), v8, 3 + vsetivli t0, 1, e32, m1, tu, ma + vslidedown.vi v8, \tmp0\(), 1 + vmv.v.v \tmp5\(), v8 + vslidedown.vi v8, \tmp2\(), 1 + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp5\(), v8, 1 + vslidedown.vi v8, \tmp0\(), 3 + vsetivli t0, 3, e32, m1, tu, ma + vslideup.vi \tmp5\(), v8, 2 + vslidedown.vi v8, \tmp2\(), 3 + vsetivli t0, 4, e32, m1, tu, ma + vslideup.vi \tmp5\(), v8, 3 + vsetivli t0, 1, e32, m1, tu, ma + vmv.v.v \tmp0\(), \tmp1\() + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp0\(), \tmp3\(), 1 + vslidedown.vi v8, \tmp1\(), 2 + vsetivli t0, 3, e32, m1, tu, ma + vslideup.vi \tmp0\(), v8, 2 + vslidedown.vi v8, \tmp3\(), 2 + vsetivli t0, 4, e32, m1, tu, ma + vslideup.vi \tmp0\(), v8, 3 + vsetivli t0, 1, e32, m1, tu, ma + vslidedown.vi v8, \tmp1\(), 1 + vmv.v.v \tmp2\(), v8 + vslidedown.vi v8, \tmp3\(), 1 + vsetivli t0, 2, e32, m1, tu, ma + vslideup.vi \tmp2\(), v8, 1 + vslidedown.vi v8, \tmp1\(), 3 + vsetivli t0, 3, e32, m1, tu, ma + vslideup.vi \tmp2\(), v8, 2 + vslidedown.vi v8, \tmp3\(), 3 + vsetivli t0, 4, e32, m1, tu, ma + vslideup.vi \tmp2\(), v8, 3 + + vsetivli t0, 1, e64, m1, tu, ma + vslidedown.vi v8, \tmp4\(), 1 + vsetivli t0, 2, e64, m1, tu, ma + vslideup.vi 
\r3\(), v8, 1 + + vsetivli t0, 1, e64, m1, tu, ma + vslidedown.vi v8, \tmp5\(), 1 + vsetivli t0, 2, e64, m1, tu, ma + vslideup.vi \r1\(), v8, 1 + + vsetivli t0, 1, e64, m1, tu, ma + vslidedown.vi v8, \tmp0\(), 1 + vsetivli t0, 2, e64, m1, tu, ma + vslideup.vi \r2\(), v8, 1 + + vsetivli t0, 1, e64, m1, tu, ma + vslidedown.vi v8, \tmp2\(), 1 + vsetivli t0, 2, e64, m1, tu, ma + vslideup.vi \r0\(), v8, 1 +.endm + +.macro tr16_8x4 in0, in1, in2, in3, offset + tr_4x4_8 \in0, \in1, \in2, \in3, v24, v25, v26, v27 + + vsetivli t0, 1, e16, m1, tu, ma + vmv.x.s s2, v0 + vslidedown.vi v12, v0, 1 + vmv.x.s s3, v12 + vslidedown.vi v12, v0, 2 + vmv.x.s s4, v12 + vslidedown.vi v12, v0, 3 + vmv.x.s s5, v12 + vslidedown.vi v12, v0, 4 + vmv.x.s s6, v12 + vslidedown.vi v12, v0, 5 + vmv.x.s s7, v12 + vslidedown.vi v12, v0, 6 + vmv.x.s s8, v12 + vslidedown.vi v12, v0, 7 + vmv.x.s s9, v12 + + vsetivli t0, 4, e16, mf2, tu, ma + vslidedown.vi v8, \in0, 4 + vwmul.vx v28, v8, s6 + vslidedown.vi v8, \in0, 4 + vwmul.vx v29, v8, s7 + vslidedown.vi v8, \in0, 4 + vwmul.vx v30, v8, s8 + vslidedown.vi v8, \in0, 4 + vwmul.vx v31, v8, s9 + + sum_sub v28, \in1, s7, +, 2 + sum_sub v29, \in1, s9, -, 2 + sum_sub v30, \in1, s6, -, 2 + sum_sub v31, \in1, s8, -, 2 + sum_sub v28, \in2, s8, +, 2 + sum_sub v29, \in2, s6, -, 2 + sum_sub v30, \in2, s9, +, 2 + sum_sub v31, \in2, s7, +, 2 + sum_sub v28, \in3, s9, +, 2 + sum_sub v29, \in3, s8, -, 2 + sum_sub v30, \in3, s7, +, 2 + sum_sub v31, \in3, s6, -, 2 + + butterfly v24, v28, v16, v23 + butterfly v25, v29, v17, v22 + butterfly v26, v30, v18, v21 + butterfly v27, v31, v19, v20 + + li a7, \offset + add a4, sp, a7 + + vsetivli t0, 4, e32, m1, tu, ma + vse32.v v16, (a4) + add a4, a4, 16 + vse32.v v17, (a4) + add a4, a4, 16 + vse32.v v18, (a4) + add a4, a4, 16 + vse32.v v19, (a4) + add a4, a4, 16 + + vse32.v v20, (a4) + add a4, a4, 16 + vse32.v v21, (a4) + add a4, a4, 16 + vse32.v v22, (a4) + add a4, a4, 16 + vse32.v v23, (a4) + add a4, a4, 16 + + add a4, a4, -64 +.endm + +.macro scale out0, out1, out2, out3, in0, in1, in2, in3, in4, in5, in6, in7, shift + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi \out0\(), \in0\(), \shift + vsetivli t0, 1, e64, m1, tu, ma + vmv.x.s a7, \out0\() + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi v8, \in1\(), \shift + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \out0\(), v8, a7 + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi \out1\(), \in2\(), \shift + vsetivli t0, 1, e64, m1, tu, ma + vmv.x.s a7, \out1\() + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi v8, \in3\(), \shift + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \out1\(), v8, a7 + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi \out2\(), \in4\(), \shift + vsetivli t0, 1, e64, m1, tu, ma + vmv.x.s a7, \out2\() + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi v8, \in5\(), \shift + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \out2\(), v8, a7 + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi \out3\(), \in6\(), \shift + vsetivli t0, 1, e64, m1, tu, ma + vmv.x.s a7, \out3\() + vsetivli t0, 4, e16, mf2, tu, ma + vnclip.wi v8, \in7\(), \shift + vsetivli t0, 2, e64, m1, tu, ma + vslide1up.vx \out3\(), v8, a7 + +.endm + +.macro tr_4x4_8 in0, in1, in2, in3, out0, out1, out2, out3, p1, p2 + vsetivli t0, 4, e16, m1, tu, ma + vwcvt.x.x.v v8, \in0 + vsetivli t0, 4, e32, m1, tu, ma + vsll.vi v28, v8, 6 + + vsetivli t0, 16, e8, m1, tu, ma + vmv.v.v v29, v28 + + vsetivli t0, 1, e16, m1, tu, ma + vmv.x.s s2, v0 + vslidedown.vi v12, v0, 1 + vmv.x.s s3, v12 + vslidedown.vi v12, v0, 3 + vmv.x.s s5, v12 + vsetivli t0, 4, e16, 
mf2, tu, ma + vwmul.vx v30, \in1, s3 + vwmul.vx v31, \in1, s5 + vwmacc.vx v28, s2, \in2 + neg s2, s2 + vwmacc.vx v29, s2, \in2 + neg s2, s2 + vwmacc.vx v30, s5, \in3 + neg s3, s3 + vwmacc.vx v31, s3, \in3 + neg s3, s3 + + vsetivli t0, 4, e32, m1, tu, ma + vadd.vv \out0, v28, v30 + vadd.vv \out1, v29, v31 + vsub.vv \out2, v29, v31 + vsub.vv \out3, v28, v30 +.endm + +.macro tr_16x4 name, shift, offset, step +func func_tr_16x4_\name, zve64x + mv a1, a5 + addi a3, a5, \step * 64 + li a2, \step * 128 + + load16 v16, v17, v18, v19 + + lla a1, trans + + vsetivli t0, 8, e16, m1, tu, ma + vle16.v v0, (a1) + + tr16_8x4 v16, v17, v18, v19, \offset + + addi a1, a5, \step * 32 + addi a3, a5, \step * 3 *32 + + li a2, \step * 128 + + load16 v20, v17, v18, v19 + + lla a1, trans + addi a1, a1, 16 + + vsetivli t0, 8, e16, m1, tu, ma + vle16.v v1, (a1) + + vsetivli t0, 1, e16, m1, tu, ma + vmv.x.s s2, v1 + vslidedown.vi v12, v1, 1 + vmv.x.s s3, v12 + vslidedown.vi v12, v1, 2 + vmv.x.s s4, v12 + vslidedown.vi v12, v1, 3 + vmv.x.s s5, v12 + vslidedown.vi v12, v1, 4 + vmv.x.s s6, v12 + vslidedown.vi v12, v1, 5 + vmv.x.s s7, v12 + vslidedown.vi v12, v1, 6 + vmv.x.s s8, v12 + vslidedown.vi v12, v1, 7 + vmv.x.s s9, v12 + + vsetivli t0, 4, e16, mf2, tu, ma + vwmul.vx v21, v20, s2 + vwmul.vx v22, v20, s3 + vwmul.vx v23, v20, s4 + vwmul.vx v24, v20, s5 + vwmul.vx v25, v20, s6 + vwmul.vx v26, v20, s7 + vwmul.vx v27, v20, s8 + vwmul.vx v28, v20, s9 + + vsetivli t0, 1, e16, m1, tu, ma + vmv.x.s s2, v1 + vslidedown.vi v12, v1, 1 + vmv.x.s s3, v12 + vslidedown.vi v12, v1, 2 + vmv.x.s s4, v12 + vslidedown.vi v12, v1, 3 + vmv.x.s s5, v12 + vslidedown.vi v12, v1, 4 + vmv.x.s s6, v12 + vslidedown.vi v12, v1, 5 + vmv.x.s s7, v12 + vslidedown.vi v12, v1, 6 + vmv.x.s s8, v12 + vslidedown.vi v12, v1, 7 + vmv.x.s s9, v12 + + add_member v20, s3, s6, s9, s7, s4, s2, s5, s8, +, +, +, -, -, -, -, -, 2 + add_member v17, s4, s9, s5, s3, s8, s6, s2, s7, +, +, -, -, -, +, +, + + add_member v17, s5, s7, s3, s9, s2, s8, s4, s6, +, -, -, +, +, +, -, -, 2 + add_member v18, s6, s4, s8, s2, s9, s3, s7, s5, +, -, -, +, -, -, +, + + add_member v18, s7, s2, s6, s8, s3, s5, s9, s4, +, -, +, +, -, +, +, -, 2 + add_member v19, s8, s5, s2, s4, s7, s9, s6, s3, +, -, +, -, +, +, -, + + add_member v19, s9, s8, s7, s6, s5, s4, s3, s2, +, -, +, -, +, -, +, -, 2 + + li a7, \offset + add a4, sp, a7 + + vsetivli t0, 4, e32, m1, tu, ma + vle32.v v16, (a4) + addi a4, a4, 16 + vle32.v v17, (a4) + addi a4, a4, 16 + vle32.v v18, (a4) + addi a4, a4, 16 + vle32.v v19, (a4) + addi a4, a4, 16 + + butterfly16 v16, v21, v17, v22, v18, v23, v19, v24 + .if \shift > 0 + scale v29, v30, v31, v24, v20, v16, v21, v17, v22, v18, v23, v19, \shift + + transpose16_4x4_2 v29, v30, v31, v24, v2, v3, v4, v5, v6, v7 + + mv a1, a6 + addi a3, a6, 24 +3*32 + li a2, 32 + li a4, -32 + + store16 v29, v30, v31, v24, a4 + .else + store_to_stack \offset, (\offset + 240), v20, v21, v22, v23, v19, v18, v17, v16 + .endif + + li a7, \offset+64 + add a4, sp, a7 + + vsetivli t0, 4, e32, m1, tu, ma + vle32.v v16, (a4) + addi a4, a4, 16 + vle32.v v17, (a4) + addi a4, a4, 16 + vle32.v v18, (a4) + addi a4, a4, 16 + vle32.v v19, (a4) + addi a4, a4, 16 + + butterfly16 v16, v25, v17, v26, v18, v27, v19, v28 + .if \shift > 0 + scale v29, v30, v31, v20, v20, v16, v25, v17, v26, v18, v27, v19, \shift + transpose16_4x4_2 v29, v30, v31, v20, v2, v3, v4, v5, v6, v7 + + add a1, a6, 8 + add a3, a6, (16 + 3 * 32) + li a2, 32 + li a4, -32 + store16 v29, v30, v31, v20, a4 + .else + store_to_stack (\offset + 64), 
(\offset + 176), v20, v25, v26, v27, v19, v18, v17, v16 + .endif + + ret +endfunc +.endm + +tr_16x4 noscale, 0, 2048, 4 + +.macro tr_32x4 name, shift +func func_tr_32x4_\name, zve64x + mv t3, ra + jal func_tr_16x4_noscale + + load32 + + lla t2, trans + addi t2, t2, 32 + + vsetivli t0, 4, e16, m1, tu, ma + vle16.v v0, (t2) + addi t2, t2, 2*4 + vle16.v v1, (t2) + addi t2, t2, 2*4 + vle16.v v2, (t2) + addi t2, t2, 2*4 + vle16.v v3, (t2) + addi t2, t2, -2*4 + + li a7, 2048 + add a4, sp, a7 + + li a2, 64 + li t1, -64 + + jal tr_block1 + mv a1, t4 + addi a3, t4, (56 + 3 * 64) + scale_store \shift + + jal tr_block2 + addi a1, t4, 8 + addi a3, t4, (48 + 3 * 64) + scale_store \shift + + jal tr_block3 + addi a1, t4, 16 + addi a3, t4, (40 + 3 * 64) + scale_store \shift + + jal tr_block4 + addi a1, t4, 24 + addi a3, t4, (32 + 3 * 64) + scale_store \shift + + jr t3 +endfunc +.endm + +tr_32x4 firstpass, 7 +tr_32x4 secondpass_8, 20 - 8 + +.macro idct_32x32 bitdepth +func ff_hevc_idct_32x32_\bitdepth\()_rvv, zve64x + mv t6, ra + addi sp,sp,-8*13 + sd ra,8*12(sp) + sd s2,8*9(sp) + sd s3,8*8(sp) + sd s4,8*7(sp) + sd s5,8*6(sp) + sd s6,8*5(sp) + sd s7,8*4(sp) + sd s8,8*3(sp) + sd s9,8*2(sp) + sd s10,8*1(sp) + sd s11,8*0(sp) + + csrwi vxrm, 1 + li a7, 2432 + sub sp, sp, a7 + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + li a7, 8 * \i + add a5, a0, a7 + + li a7, 8 * \i * 32 + add t4, sp, a7 + jal func_tr_32x4_firstpass +.endr + +.irp i, 0, 1, 2, 3, 4, 5, 6, 7 + addi a5, sp, 8 * \i + addi t4, a0, 8 * \i * 32 + jal func_tr_32x4_secondpass_\bitdepth +.endr + + li a7, 2432 + add sp, sp, a7 + ld ra,8*12(sp) + ld s2,8*9(sp) + ld s3,8*8(sp) + ld s4,8*7(sp) + ld s5,8*6(sp) + ld s6,8*5(sp) + ld s7,8*4(sp) + ld s8,8*3(sp) + ld s9,8*2(sp) + ld s10,8*1(sp) + ld s11,8*0(sp) + addi sp,sp,8*13 + + jr t6 +endfunc +.endm + +idct_32x32 8 diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c index 1d8326a573..6dfb889eec 100644 --- a/libavcodec/riscv/hevcdsp_init.c +++ b/libavcodec/riscv/hevcdsp_init.c @@ -27,6 +27,8 @@ #include "libavcodec/hevc/dsp.h" #include "libavcodec/riscv/h26x/h2656dsp.h" +void ff_hevc_idct_32x32_8_rvv(int16_t *coeffs, int col_limit); + #define RVV_FNASSIGN(member, v, h, fn, ext) \ member[1][v][h] = ff_h2656_put_pixels_##8_##ext; \ member[3][v][h] = ff_h2656_put_pixels_##8_##ext; \ @@ -40,27 +42,37 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth) const int flags = av_get_cpu_flags(); int vlenb; - if (!(flags & AV_CPU_FLAG_RVV_I32) || !(flags & AV_CPU_FLAG_RVB)) - return; - vlenb = ff_get_rv_vlenb(); - if (vlenb >= 32) { - switch (bit_depth) { - case 8: - RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256); - RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256); - break; - default: - break; - } - } else if (vlenb >= 16) { - switch (bit_depth) { - case 8: - RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128); - RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128); - break; - default: - break; + + if (flags & AV_CPU_FLAG_RVV_I64) + if (vlenb >= 16) + switch (bit_depth) { + case 8: + c->idct[3] = ff_hevc_idct_32x32_8_rvv; + break; + default: + break; + } + + if ((flags & AV_CPU_FLAG_RVV_I32) && (flags & AV_CPU_FLAG_RVB)){ + if (vlenb >= 32) { + switch (bit_depth) { + case 8: + RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_256); + RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_256); + break; + default: + break; + } + } else if (vlenb >= 16) { + switch (bit_depth) { + case 8: + RVV_FNASSIGN(c->put_hevc_qpel, 0, 0, pel_pixels, rvv_128); + 
RVV_FNASSIGN(c->put_hevc_epel, 0, 0, pel_pixels, rvv_128);
+                break;
+            default:
+                break;
+            }
         }
     }
 #endif
-- 
2.25.1
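Note on the runtime gate added in hevcdsp_init.c: vlenb is the vector register width
in bytes, so the vlenb >= 16 check asks for VLEN >= 128 bits, which the e64/m1 loads
and stores in the assembly need, and AV_CPU_FLAG_RVV_I64 covers the Zve64x requirement
declared by the functions. A standalone restatement of the condition, for illustration
only (the helper name is made up):

    #include "libavutil/cpu.h"

    /* Illustrative only: when the new 8-bit 32x32 IDCT gets registered. */
    static int use_rvv_idct_32x32_8(int vlenb)
    {
        int flags = av_get_cpu_flags();
        /* needs 64-bit vector elements (Zve64x) and VLEN >= 128 bits */
        return (flags & AV_CPU_FLAG_RVV_I64) && vlenb >= 16;
    }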