From: Zhanheng Yang <[email protected]>

Add RISC-V Vector optimizations of the HEVC qpel vertical filters
(put, uni, uni_w and bi) for 8-bit.

Benchmarked on an A210 C908 core (VLEN 128):
put_hevc_qpel_v4_8_c:                                  265.0 ( 1.00x)
put_hevc_qpel_v4_8_rvv_i32:                            117.0 ( 2.26x)
put_hevc_qpel_v6_8_c:                                  568.8 ( 1.00x)
put_hevc_qpel_v6_8_rvv_i32:                            162.3 ( 3.50x)
put_hevc_qpel_v8_8_c:                                  986.9 ( 1.00x)
put_hevc_qpel_v8_8_rvv_i32:                            200.9 ( 4.91x)
put_hevc_qpel_v12_8_c:                                2236.1 ( 1.00x)
put_hevc_qpel_v12_8_rvv_i32:                           294.8 ( 7.58x)
put_hevc_qpel_v16_8_c:                                3958.8 ( 1.00x)
put_hevc_qpel_v16_8_rvv_i32:                           387.0 (10.23x)
put_hevc_qpel_v24_8_c:                                8707.6 ( 1.00x)
put_hevc_qpel_v24_8_rvv_i32:                          1096.5 ( 7.94x)
put_hevc_qpel_v32_8_c:                               15392.3 ( 1.00x)
put_hevc_qpel_v32_8_rvv_i32:                          1442.4 (10.67x)
put_hevc_qpel_v48_8_c:                               34569.2 ( 1.00x)
put_hevc_qpel_v48_8_rvv_i32:                          3197.1 (10.81x)
put_hevc_qpel_v64_8_c:                               61109.7 ( 1.00x)
put_hevc_qpel_v64_8_rvv_i32:                          5642.4 (10.83x)
put_hevc_qpel_uni_v4_8_c:                              354.9 ( 1.00x)
put_hevc_qpel_uni_v4_8_rvv_i32:                        131.3 ( 2.70x)
put_hevc_qpel_uni_v6_8_c:                              769.3 ( 1.00x)
put_hevc_qpel_uni_v6_8_rvv_i32:                        180.8 ( 4.25x)
put_hevc_qpel_uni_v8_8_c:                             1399.3 ( 1.00x)
put_hevc_qpel_uni_v8_8_rvv_i32:                        223.6 ( 6.26x)
put_hevc_qpel_uni_v12_8_c:                            3031.4 ( 1.00x)
put_hevc_qpel_uni_v12_8_rvv_i32:                       323.2 ( 9.38x)
put_hevc_qpel_uni_v16_8_c:                            5334.2 ( 1.00x)
put_hevc_qpel_uni_v16_8_rvv_i32:                       417.9 (12.76x)
put_hevc_qpel_uni_v24_8_c:                           11908.4 ( 1.00x)
put_hevc_qpel_uni_v24_8_rvv_i32:                      1212.2 ( 9.82x)
put_hevc_qpel_uni_v32_8_c:                           21030.6 ( 1.00x)
put_hevc_qpel_uni_v32_8_rvv_i32:                      1579.5 (13.31x)
put_hevc_qpel_uni_v48_8_c:                           47025.7 ( 1.00x)
put_hevc_qpel_uni_v48_8_rvv_i32:                      3500.2 (13.43x)
put_hevc_qpel_uni_v64_8_c:                           83487.0 ( 1.00x)
put_hevc_qpel_uni_v64_8_rvv_i32:                      6188.4 (13.49x)
put_hevc_qpel_uni_w_v4_8_c:                            396.3 ( 1.00x)
put_hevc_qpel_uni_w_v4_8_rvv_i32:                      200.9 ( 1.97x)
put_hevc_qpel_uni_w_v6_8_c:                            851.4 ( 1.00x)
put_hevc_qpel_uni_w_v6_8_rvv_i32:                      282.1 ( 3.02x)
put_hevc_qpel_uni_w_v8_8_c:                           1544.0 ( 1.00x)
put_hevc_qpel_uni_w_v8_8_rvv_i32:                      356.5 ( 4.33x)
put_hevc_qpel_uni_w_v12_8_c:                          3329.0 ( 1.00x)
put_hevc_qpel_uni_w_v12_8_rvv_i32:                     519.6 ( 6.41x)
put_hevc_qpel_uni_w_v16_8_c:                          5857.9 ( 1.00x)
put_hevc_qpel_uni_w_v16_8_rvv_i32:                     679.6 ( 8.62x)
put_hevc_qpel_uni_w_v24_8_c:                         13050.5 ( 1.00x)
put_hevc_qpel_uni_w_v24_8_rvv_i32:                    1965.5 ( 6.64x)
put_hevc_qpel_uni_w_v32_8_c:                         23219.4 ( 1.00x)
put_hevc_qpel_uni_w_v32_8_rvv_i32:                    2601.6 ( 8.93x)
put_hevc_qpel_uni_w_v48_8_c:                         51925.3 ( 1.00x)
put_hevc_qpel_uni_w_v48_8_rvv_i32:                    5786.7 ( 8.97x)
put_hevc_qpel_uni_w_v64_8_c:                         92075.5 ( 1.00x)
put_hevc_qpel_uni_w_v64_8_rvv_i32:                   10269.8 ( 8.97x)
put_hevc_qpel_bi_v4_8_c:                               376.4 ( 1.00x)
put_hevc_qpel_bi_v4_8_rvv_i32:                         150.2 ( 2.51x)
put_hevc_qpel_bi_v6_8_c:                               808.3 ( 1.00x)
put_hevc_qpel_bi_v6_8_rvv_i32:                         207.1 ( 3.90x)
put_hevc_qpel_bi_v8_8_c:                              1490.1 ( 1.00x)
put_hevc_qpel_bi_v8_8_rvv_i32:                         257.2 ( 5.79x)
put_hevc_qpel_bi_v12_8_c:                             3220.3 ( 1.00x)
put_hevc_qpel_bi_v12_8_rvv_i32:                        375.2 ( 8.58x)
put_hevc_qpel_bi_v16_8_c:                             5657.5 ( 1.00x)
put_hevc_qpel_bi_v16_8_rvv_i32:                        482.5 (11.72x)
put_hevc_qpel_bi_v24_8_c:                            12495.4 ( 1.00x)
put_hevc_qpel_bi_v24_8_rvv_i32:                       1383.8 ( 9.03x)
put_hevc_qpel_bi_v32_8_c:                            22191.6 ( 1.00x)
put_hevc_qpel_bi_v32_8_rvv_i32:                       1822.0 (12.18x)
put_hevc_qpel_bi_v48_8_c:                            49654.0 ( 1.00x)
put_hevc_qpel_bi_v48_8_rvv_i32:                       4046.8 (12.27x)
put_hevc_qpel_bi_v64_8_c:                            88287.8 ( 1.00x)
put_hevc_qpel_bi_v64_8_rvv_i32:                       7196.6 (12.27x)

Signed-off-by: Zhanheng Yang <[email protected]>
---
 libavcodec/riscv/h26x/h2656dsp.h     |  11 +
 libavcodec/riscv/h26x/hevcqpel_rvv.S | 321 ++++++++++++++++++++++++++-
 libavcodec/riscv/hevcdsp_init.c      |   4 +
 3 files changed, 335 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index 028b9ffbfd..2dabc16aee 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -36,4 +36,15 @@ void ff_hevc_put_qpel_uni_w_h_8_m1_rvv(uint8_t *_dst,  ptrdiff_t _dststride,
 void ff_hevc_put_qpel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
         mx, intptr_t my, int width);
+void ff_hevc_put_qpel_v_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_uni_w_v_8_m1_rvv(uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_qpel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+        mx, intptr_t my, int width);
 #endif
diff --git a/libavcodec/riscv/h26x/hevcqpel_rvv.S b/libavcodec/riscv/h26x/hevcqpel_rvv.S
index 52d7acac33..8fd3c47bcc 100644
--- a/libavcodec/riscv/h26x/hevcqpel_rvv.S
+++ b/libavcodec/riscv/h26x/hevcqpel_rvv.S
@@ -306,4 +306,323 @@ func ff_hevc_put_qpel_bi_h_8_\lmul\()_rvv, zve32x
 endfunc
 .endm
 
-hevc_qpel_h m1, m2, m4
\ No newline at end of file
+hevc_qpel_h m1, m2, m4
+
+/* output is unclipped; shifts the row window (vsrc0..vsrc6 <- vsrc1..vsrc7); clobbers v4 */
+.macro filter_v         vdst, vsrc0, vsrc1, vsrc2, vsrc3, vsrc4, vsrc5, vsrc6, vsrc7
+        vmv.v.x          v4, s1
+        vwmulsu.vv       \vdst, v4, \vsrc0
+        vwmaccsu.vx      \vdst, s2, \vsrc1
+        vmv.v.v          \vsrc0, \vsrc1
+        vwmaccsu.vx      \vdst, s3, \vsrc2
+        vmv.v.v          \vsrc1, \vsrc2
+        vwmaccsu.vx      \vdst, s4, \vsrc3
+        vmv.v.v          \vsrc2, \vsrc3
+        vwmaccsu.vx      \vdst, s5, \vsrc4
+        vmv.v.v          \vsrc3, \vsrc4
+        vwmaccsu.vx      \vdst, s6, \vsrc5
+        vmv.v.v          \vsrc4, \vsrc5
+        vwmaccsu.vx      \vdst, s7, \vsrc6
+        vmv.v.v          \vsrc5, \vsrc6
+        vwmaccsu.vx      \vdst, s8, \vsrc7
+        vmv.v.v          \vsrc6, \vsrc7
+.endm
+
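+/* lmul/lmul2/lmul4: the LMUL settings used for e8/e16/e32 operations at one VL */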
+.macro hevc_qpel_v       lmul, lmul2, lmul4
+func ff_hevc_put_qpel_v_8_\lmul\()_rvv, zve32x
+    addi        sp, sp, -64
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    sx          s5, 32(sp)
+    sx          s6, 40(sp)
+    sx          s7, 48(sp)
+    sx          s8, 56(sp)
+    load_filter a5
+    slli        t1, a2, 1
+    add         t1, t1, a2
+    sub         a1, a1, t1      # src - 3 * src_stride
+    li          t1, 0           # offset
+    mv          t4, a3
+
+1:
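+    # outer loop: process the width in VL-sized column strips; t1 is the column offset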
+    add         t2, a1, t1
+    slli        t3, t1, 1
+    add         t3, a0, t3
+
+    vsetvli     t5, a6, e8, \lmul, ta, ma
+    vle8.v      v16, (t2)
+    add         t2, t2, a2
+    vle8.v      v18, (t2)
+    add         t2, t2, a2
+    vle8.v      v20, (t2)
+    add         t2, t2, a2
+    vle8.v      v22, (t2)
+    add         t2, t2, a2
+    vle8.v      v24, (t2)
+    add         t2, t2, a2
+    vle8.v      v26, (t2)
+    add         t2, t2, a2
+    vle8.v      v28, (t2)
+    add         t2, t2, a2
+
+2:
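+    # inner loop: load one new row, filter, store one row of unclipped 16-bit output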
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v30, (t2)
+    add         t2, t2, a2
+    filter_v    v0, v16, v18, v20, v22, v24, v26, v28, v30
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vse16.v     v0, (t3)
+    addi        t3, t3, 2*HEVC_MAX_PB_SIZE
+    addi        a3, a3, -1
+    bgt         a3, zero, 2b
+    add         t1, t1, t5
+    sub         a6, a6, t5
+    mv          a3, t4
+    bgt         a6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    lx          s5, 32(sp)
+    lx          s6, 40(sp)
+    lx          s7, 48(sp)
+    lx          s8, 56(sp)
+    addi        sp, sp, 64
+    ret
+endfunc
+
+func ff_hevc_put_qpel_uni_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0
+    addi        sp, sp, -64
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    sx          s5, 32(sp)
+    sx          s6, 40(sp)
+    sx          s7, 48(sp)
+    sx          s8, 56(sp)
+    load_filter a6
+    slli        t1, a3, 1
+    add         t1, t1, a3
+    sub         a2, a2, t1      # src - 3 * src_stride
+    li          t1, 0           # offset
+    mv          t4, a4
+
+1:
+    add         t2, a2, t1
+    add         t3, a0, t1
+
+    vsetvli     t5, a7, e8, \lmul, ta, ma
+    vle8.v      v16, (t2)
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+    vle8.v      v22, (t2)
+    add         t2, t2, a3
+    vle8.v      v24, (t2)
+    add         t2, t2, a3
+    vle8.v      v26, (t2)
+    add         t2, t2, a3
+    vle8.v      v28, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v30, (t2)
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22, v24, v26, v28, v30
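+    # clamp negatives, then narrow to u8 with a rounding right shift by 6 (vxrm = rnu)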
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vmax.vx     v0, v0, zero
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 6
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a4, a4, -1
+    bgt         a4, zero, 2b
+    add         t1, t1, t5
+    sub         a7, a7, t5
+    mv          a4, t4
+    bgt         a7, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    lx          s5, 32(sp)
+    lx          s6, 40(sp)
+    lx          s7, 48(sp)
+    lx          s8, 56(sp)
+    addi        sp, sp, 64
+    ret
+endfunc
+
+func ff_hevc_put_qpel_uni_w_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0
+#if (__riscv_xlen == 32)
+    lw          t1, 4(sp)       # my
+    lw          t6, 8(sp)       # width
+#elif (__riscv_xlen == 64)
+    ld          t1, 8(sp)
+    lw          t6, 16(sp)
+#endif
+    addi        sp, sp, -64
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    sx          s5, 32(sp)
+    sx          s6, 40(sp)
+    sx          s7, 48(sp)
+    sx          s8, 56(sp)
+    load_filter t1
+    addi        a5, a5, 6       # shift
+    slli        t1, a3, 1
+    add         t1, t1, a3
+    sub         a2, a2, t1      # src - 3 * src_stride
+    li          t1, 0           # offset
+    mv          t4, a4
+
+1:
+    add         t2, a2, t1
+    add         t3, a0, t1
+
+    vsetvli     t5, t6, e8, \lmul, ta, ma
+    vle8.v      v16, (t2)
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+    vle8.v      v22, (t2)
+    add         t2, t2, a3
+    vle8.v      v24, (t2)
+    add         t2, t2, a3
+    vle8.v      v26, (t2)
+    add         t2, t2, a3
+    vle8.v      v28, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v30, (t2)
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22, v24, v26, v28, v30
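+    # weighted prediction: clip_u8(((tmp * wx) >> (denom + 6)) + ox), rounded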
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vwmul.vx    v8, v0, a6
+    vsetvli     zero, zero, e32, \lmul4, ta, ma
+    vssra.vx    v0, v8, a5
+    vsadd.vx    v0, v0, a7
+    vmax.vx     v0, v0, zero
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vnclip.wi   v0, v0, 0
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 0
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a4, a4, -1
+    bgt         a4, zero, 2b
+    add         t1, t1, t5
+    sub         t6, t6, t5
+    mv          a4, t4
+    bgt         t6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    lx          s5, 32(sp)
+    lx          s6, 40(sp)
+    lx          s7, 48(sp)
+    lx          s8, 56(sp)
+    addi        sp, sp, 64
+    ret
+endfunc
+
+func ff_hevc_put_qpel_bi_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0
+    lw          t6, 0(sp)      # width
+    addi        sp, sp, -64
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    sx          s5, 32(sp)
+    sx          s6, 40(sp)
+    sx          s7, 48(sp)
+    sx          s8, 56(sp)
+    load_filter a7
+    slli        t1, a3, 1
+    add         t1, t1, a3
+    sub         a2, a2, t1      # src - 3 * src_stride
+    li          t1, 0           # offset
+    mv          t4, a5
+
+1:
+    add         t2, a2, t1
+    add         t3, a0, t1
+    slli        t0, t1, 1
+    add         t0, a4, t0
+
+    vsetvli     t5, t6, e8, \lmul, ta, ma
+    vle8.v      v16, (t2)
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+    vle8.v      v22, (t2)
+    add         t2, t2, a3
+    vle8.v      v24, (t2)
+    add         t2, t2, a3
+    vle8.v      v26, (t2)
+    add         t2, t2, a3
+    vle8.v      v28, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v30, (t2)
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22, v24, v26, v28, v30
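+    # bi-prediction: clip_u8((tmp + src2 + 64) >> 7), using saturating adds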
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vle16.v     v8, (t0)
+    addi        t0, t0, 2*HEVC_MAX_PB_SIZE
+    vsadd.vv    v0, v0, v8
+    vmax.vx     v0, v0, zero
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 7
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a5, a5, -1
+    bgt         a5, zero, 2b
+    add         t1, t1, t5
+    sub         t6, t6, t5
+    mv          a5, t4
+    bgt         t6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    lx          s5, 32(sp)
+    lx          s6, 40(sp)
+    lx          s7, 48(sp)
+    lx          s8, 56(sp)
+    addi        sp, sp, 64
+    ret
+endfunc
+.endm
+
+hevc_qpel_v m1, m2, m4
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 59333740de..480cfd2968 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -84,6 +84,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
                RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni, 0, 1, ff_hevc_put_qpel_uni_h_8_m1_rvv);
                RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 0, 1, ff_hevc_put_qpel_uni_w_h_8_m1_rvv);
                RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 0, 1, ff_hevc_put_qpel_bi_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_qpel, 1, 0, ff_hevc_put_qpel_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni, 1, 0, ff_hevc_put_qpel_uni_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_qpel_uni_w, 1, 0, ff_hevc_put_qpel_uni_w_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_qpel_bi, 1, 0, ff_hevc_put_qpel_bi_v_8_m1_rvv);
 
                 break;
             default:
-- 
2.25.1
