From: Zhanheng Yang <[email protected]>

Bench on A210 C908 core (VLEN 128).
put_hevc_epel_v4_8_c:                                  157.8 ( 1.00x)
put_hevc_epel_v4_8_rvv_i32:                             73.2 ( 2.16x)
put_hevc_epel_v6_8_c:                                  314.6 ( 1.00x)
put_hevc_epel_v6_8_rvv_i32:                            101.2 ( 3.11x)
put_hevc_epel_v8_8_c:                                  545.5 ( 1.00x)
put_hevc_epel_v8_8_rvv_i32:                            124.4 ( 4.39x)
put_hevc_epel_v12_8_c:                                1240.8 ( 1.00x)
put_hevc_epel_v12_8_rvv_i32:                           183.6 ( 6.76x)
put_hevc_epel_v16_8_c:                                2170.7 ( 1.00x)
put_hevc_epel_v16_8_rvv_i32:                           235.1 ( 9.23x)
put_hevc_epel_v24_8_c:                                4743.5 ( 1.00x)
put_hevc_epel_v24_8_rvv_i32:                           677.5 ( 7.00x)
put_hevc_epel_v32_8_c:                                8353.4 ( 1.00x)
put_hevc_epel_v32_8_rvv_i32:                           892.1 ( 9.36x)
put_hevc_epel_v48_8_c:                               18608.1 ( 1.00x)
put_hevc_epel_v48_8_rvv_i32:                          1956.1 ( 9.51x)
put_hevc_epel_v64_8_c:                               32934.3 ( 1.00x)
put_hevc_epel_v64_8_rvv_i32:                          3454.1 ( 9.53x)
put_hevc_epel_uni_v4_8_c:                              237.5 ( 1.00x)
put_hevc_epel_uni_v4_8_rvv_i32:                         87.5 ( 2.72x)
put_hevc_epel_uni_v6_8_c:                              509.5 ( 1.00x)
put_hevc_epel_uni_v6_8_rvv_i32:                        119.6 ( 4.26x)
put_hevc_epel_uni_v8_8_c:                              982.8 ( 1.00x)
put_hevc_epel_uni_v8_8_rvv_i32:                        147.1 ( 6.68x)
put_hevc_epel_uni_v12_8_c:                            2027.7 ( 1.00x)
put_hevc_epel_uni_v12_8_rvv_i32:                       211.0 ( 9.61x)
put_hevc_epel_uni_v16_8_c:                            3525.4 ( 1.00x)
put_hevc_epel_uni_v16_8_rvv_i32:                       278.8 (12.64x)
put_hevc_epel_uni_v24_8_c:                            7804.3 ( 1.00x)
put_hevc_epel_uni_v24_8_rvv_i32:                       778.9 (10.02x)
put_hevc_epel_uni_v32_8_c:                           13807.3 ( 1.00x)
put_hevc_epel_uni_v32_8_rvv_i32:                      1028.7 (13.42x)
put_hevc_epel_uni_v48_8_c:                           30934.9 ( 1.00x)
put_hevc_epel_uni_v48_8_rvv_i32:                      2265.1 (13.66x)
put_hevc_epel_uni_v64_8_c:                           54705.5 ( 1.00x)
put_hevc_epel_uni_v64_8_rvv_i32:                      4003.7 (13.66x)
put_hevc_epel_uni_w_v4_8_c:                            313.8 ( 1.00x)
put_hevc_epel_uni_w_v4_8_rvv_i32:                      156.6 ( 2.00x)
put_hevc_epel_uni_w_v6_8_c:                            674.3 ( 1.00x)
put_hevc_epel_uni_w_v6_8_rvv_i32:                      222.8 ( 3.03x)
put_hevc_epel_uni_w_v8_8_c:                           1253.3 ( 1.00x)
put_hevc_epel_uni_w_v8_8_rvv_i32:                      279.4 ( 4.49x)
put_hevc_epel_uni_w_v12_8_c:                          2619.4 ( 1.00x)
put_hevc_epel_uni_w_v12_8_rvv_i32:                     410.2 ( 6.39x)
put_hevc_epel_uni_w_v16_8_c:                          4614.2 ( 1.00x)
put_hevc_epel_uni_w_v16_8_rvv_i32:                     535.8 ( 8.61x)
put_hevc_epel_uni_w_v24_8_c:                         10290.6 ( 1.00x)
put_hevc_epel_uni_w_v24_8_rvv_i32:                    1550.6 ( 6.64x)
put_hevc_epel_uni_w_v32_8_c:                         18169.4 ( 1.00x)
put_hevc_epel_uni_w_v32_8_rvv_i32:                    2047.2 ( 8.88x)
put_hevc_epel_uni_w_v48_8_c:                         40704.3 ( 1.00x)
put_hevc_epel_uni_w_v48_8_rvv_i32:                    4552.4 ( 8.94x)
put_hevc_epel_uni_w_v64_8_c:                         72197.1 ( 1.00x)
put_hevc_epel_uni_w_v64_8_rvv_i32:                    8069.4 ( 8.95x)
put_hevc_epel_bi_v4_8_c:                               262.7 ( 1.00x)
put_hevc_epel_bi_v4_8_rvv_i32:                         105.9 ( 2.48x)
put_hevc_epel_bi_v6_8_c:                               553.0 ( 1.00x)
put_hevc_epel_bi_v6_8_rvv_i32:                         145.4 ( 3.80x)
put_hevc_epel_bi_v8_8_c:                              1045.5 ( 1.00x)
put_hevc_epel_bi_v8_8_rvv_i32:                         180.3 ( 5.80x)
put_hevc_epel_bi_v12_8_c:                             2172.7 ( 1.00x)
put_hevc_epel_bi_v12_8_rvv_i32:                        264.2 ( 8.22x)
put_hevc_epel_bi_v16_8_c:                             3791.6 ( 1.00x)
put_hevc_epel_bi_v16_8_rvv_i32:                        336.5 (11.27x)
put_hevc_epel_bi_v24_8_c:                             8424.1 ( 1.00x)
put_hevc_epel_bi_v24_8_rvv_i32:                        967.2 ( 8.71x)
put_hevc_epel_bi_v32_8_c:                            14910.8 ( 1.00x)
put_hevc_epel_bi_v32_8_rvv_i32:                       1270.7 (11.73x)
put_hevc_epel_bi_v48_8_c:                            33326.5 ( 1.00x)
put_hevc_epel_bi_v48_8_rvv_i32:                       2804.7 (11.88x)
put_hevc_epel_bi_v64_8_c:                            59177.9 ( 1.00x)
put_hevc_epel_bi_v64_8_rvv_i32:                       5022.3 (11.78x)

Signed-off-by: Zhanheng Yang <[email protected]>
---
 libavcodec/riscv/h26x/h2656dsp.h     |  11 ++
 libavcodec/riscv/h26x/hevcepel_rvv.S | 235 ++++++++++++++++++++++++++-
 libavcodec/riscv/hevcdsp_init.c      |   4 +
 3 files changed, 249 insertions(+), 1 deletion(-)

diff --git a/libavcodec/riscv/h26x/h2656dsp.h b/libavcodec/riscv/h26x/h2656dsp.h
index fa2f5a88e3..085ed4cf14 100644
--- a/libavcodec/riscv/h26x/h2656dsp.h
+++ b/libavcodec/riscv/h26x/h2656dsp.h
@@ -59,4 +59,15 @@ void ff_hevc_put_epel_uni_w_h_8_m1_rvv(uint8_t *_dst,  ptrdiff_t _dststride,
 void ff_hevc_put_epel_bi_h_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
         ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
         mx, intptr_t my, int width);
+void ff_hevc_put_epel_v_8_m1_rvv(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_uni_w_v_8_m1_rvv(uint8_t *_dst,  ptrdiff_t _dststride,
+        const uint8_t *_src, ptrdiff_t _srcstride,
+        int height, int denom, int wx, int ox,
+        intptr_t mx, intptr_t my, int width);
+void ff_hevc_put_epel_bi_v_8_m1_rvv(uint8_t *_dst, ptrdiff_t _dststride, const uint8_t *_src,
+        ptrdiff_t _srcstride, const int16_t *src2, int height, intptr_t
+        mx, intptr_t my, int width);
 #endif
diff --git a/libavcodec/riscv/h26x/hevcepel_rvv.S b/libavcodec/riscv/h26x/hevcepel_rvv.S
index 81044846f7..caca0b88ab 100644
--- a/libavcodec/riscv/h26x/hevcepel_rvv.S
+++ b/libavcodec/riscv/h26x/hevcepel_rvv.S
@@ -262,4 +262,237 @@ func ff_hevc_put_epel_bi_h_8_\lmul\()_rvv, zve32x
 endfunc
 .endm
 
-hevc_epel_h m1, m2, m4
\ No newline at end of file
+hevc_epel_h m1, m2, m4
+
+/* output is unclipped; clobbers v4; on exit vsrc0..vsrc2 hold the old vsrc1..vsrc3 */
+.macro filter_v         vdst, vsrc0, vsrc1, vsrc2, vsrc3
+        vmv.v.x          v4, s1                 # broadcast s1: vwmulsu takes its signed operand from a vector
+        vwmulsu.vv       \vdst, v4, \vsrc0      # vdst  = s1 * vsrc0 (widening, signed x unsigned)
+        vwmaccsu.vx      \vdst, s2, \vsrc1      # vdst += s2 * vsrc1
+        vmv.v.v          \vsrc0, \vsrc1         # slide the row window: vsrc0 <- vsrc1
+        vwmaccsu.vx      \vdst, s3, \vsrc2      # vdst += s3 * vsrc2
+        vmv.v.v          \vsrc1, \vsrc2         # vsrc1 <- vsrc2
+        vwmaccsu.vx      \vdst, s4, \vsrc3      # vdst += s4 * vsrc3
+        vmv.v.v          \vsrc2, \vsrc3         # vsrc2 <- vsrc3
+.endm
+
+.macro hevc_epel_v       lmul, lmul2, lmul4
+func ff_hevc_put_epel_v_8_\lmul\()_rvv, zve32x
+    addi        sp, sp, -32     # spill callee-saved s1..s4, which filter_v uses as its four multipliers
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a5
+    sub         a1, a1, a2      # src - src_stride
+    li          t1, 0           # offset
+    mv          t4, a3          # keep height so the row counter can be reloaded per column strip
+
+1:
+    add         t2, a1, t1      # t2 = source pointer for this column strip
+    slli        t3, t1, 1       # dst elements are int16_t, so the byte offset is doubled
+    add         t3, a0, t3
+
+    vsetvli     t5, a6, e8, \lmul, ta, ma       # t5 = columns covered by this strip
+    vle8.v      v16, (t2)       # preload the first three of the four rows the filter consumes
+    add         t2, t2, a2
+    vle8.v      v18, (t2)
+    add         t2, t2, a2
+    vle8.v      v20, (t2)
+    add         t2, t2, a2
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v22, (t2)       # newest row, filter_v then slides the row window down by one
+    add         t2, t2, a2
+    filter_v    v0, v16, v18, v20, v22
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vse16.v     v0, (t3)        # store the unclipped 16-bit sums
+    addi        t3, t3, 2*HEVC_MAX_PB_SIZE
+    addi        a3, a3, -1
+    bgt         a3, zero, 2b
+    add         t1, t1, t5      # advance to the next column strip
+    sub         a6, a6, t5
+    mv          a3, t4
+    bgt         a6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_uni_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0         # rounding mode rnu (round-to-nearest-up) for vnclipu below
+    addi        sp, sp, -32     # spill callee-saved s1..s4, which filter_v uses as its four multipliers
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a6
+    sub         a2, a2, a3      # src - src_stride
+    li          t1, 0           # offset
+    mv          t4, a4          # keep height so the row counter can be reloaded per column strip
+
+1:
+    add         t2, a2, t1      # t2 = source pointer for this column strip
+    add         t3, a0, t1      # t3 = destination pointer (8-bit pixels, same byte offset)
+
+    vsetvli     t5, a7, e8, \lmul, ta, ma       # t5 = columns covered by this strip
+    vle8.v      v16, (t2)       # preload the first three of the four rows the filter consumes
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v22, (t2)       # newest row, filter_v then slides the row window down by one
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vmax.vx     v0, v0, zero    # clamp negatives to 0 before the unsigned narrowing
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 6       # rounding shift right by 6, saturated to 8 bits
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a4, a4, -1
+    bgt         a4, zero, 2b
+    add         t1, t1, t5      # advance to the next column strip
+    sub         a7, a7, t5
+    mv          a4, t4
+    bgt         a7, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_uni_w_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0         # rounding mode rnu (round-to-nearest-up) for vssra below
+#if (__riscv_xlen == 32)
+    lw          t1, 4(sp)       # my
+    lw          t6, 8(sp)       # width
+#elif (__riscv_xlen == 64)
+    ld          t1, 8(sp)       # my
+    lw          t6, 16(sp)      # width
+#endif
+    addi        sp, sp, -32     # stack arguments were read above, before sp moves
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter t1
+    addi        a5, a5, 6       # shift
+    sub         a2, a2, a3      # src - src_stride
+    li          t1, 0           # offset (t1 is free again after load_filter)
+    mv          t4, a4          # keep height so the row counter can be reloaded per column strip
+
+1:
+    add         t2, a2, t1      # t2 = source pointer for this column strip
+    add         t3, a0, t1      # t3 = destination pointer (8-bit pixels, same byte offset)
+
+    vsetvli     t5, t6, e8, \lmul, ta, ma       # t5 = columns covered by this strip
+    vle8.v      v16, (t2)       # preload the first three of the four rows the filter consumes
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v22, (t2)       # newest row, filter_v then slides the row window down by one
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vwmul.vx    v8, v0, a6      # widen to 32 bits: filtered sum * wx
+    vsetvli     zero, zero, e32, \lmul4, ta, ma
+    vssra.vx    v0, v8, a5      # rounding arithmetic shift right by denom + 6
+    vsadd.vx    v0, v0, a7      # add ox with saturation
+    vmax.vx     v0, v0, zero    # clamp negatives to 0 before the unsigned narrowing
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vnclip.wi   v0, v0, 0       # saturating narrow 32 -> 16 bits
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 0       # saturating narrow 16 -> 8 bits
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a4, a4, -1
+    bgt         a4, zero, 2b
+    add         t1, t1, t5      # advance to the next column strip
+    sub         t6, t6, t5
+    mv          a4, t4
+    bgt         t6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+
+func ff_hevc_put_epel_bi_v_8_\lmul\()_rvv, zve32x
+    csrwi       vxrm, 0         # rounding mode rnu (round-to-nearest-up) for vnclipu below
+    lw          t6, 0(sp)      # width
+    addi        sp, sp, -32     # the stack argument was read above, before sp moves
+    sx          s1, 0(sp)
+    sx          s2, 8(sp)
+    sx          s3, 16(sp)
+    sx          s4, 24(sp)
+    load_filter a7
+    sub         a2, a2, a3      # src - src_stride
+    li          t1, 0           # offset
+    mv          t4, a5          # keep height so the row counter can be reloaded per column strip
+
+1:
+    add         t2, a2, t1      # t2 = source pointer for this column strip
+    add         t3, a0, t1      # t3 = destination pointer (8-bit pixels, same byte offset)
+    slli        t0, t1, 1       # src2 elements are int16_t, so the byte offset is doubled
+    add         t0, a4, t0      # t0 = src2 pointer for this column strip
+
+    vsetvli     t5, t6, e8, \lmul, ta, ma       # t5 = columns covered by this strip
+    vle8.v      v16, (t2)       # preload the first three of the four rows the filter consumes
+    add         t2, t2, a3
+    vle8.v      v18, (t2)
+    add         t2, t2, a3
+    vle8.v      v20, (t2)
+    add         t2, t2, a3
+
+2:
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vle8.v      v22, (t2)       # newest row, filter_v then slides the row window down by one
+    add         t2, t2, a3
+    filter_v    v0, v16, v18, v20, v22
+    vsetvli     zero, zero, e16, \lmul2, ta, ma
+    vle16.v     v8, (t0)        # matching row of the 16-bit src2 buffer
+    addi        t0, t0, 2*HEVC_MAX_PB_SIZE
+    vsadd.vv    v0, v0, v8      # saturating add of both predictions
+    vmax.vx     v0, v0, zero    # clamp negatives to 0 before the unsigned narrowing
+    vsetvli     zero, zero, e8, \lmul, ta, ma
+    vnclipu.wi  v0, v0, 7       # rounding shift right by 7, saturated to 8 bits
+    vse8.v      v0, (t3)
+    add         t3, t3, a1
+    addi        a5, a5, -1
+    bgt         a5, zero, 2b
+    add         t1, t1, t5      # advance to the next column strip
+    sub         t6, t6, t5
+    mv          a5, t4
+    bgt         t6, zero, 1b
+
+    lx          s1, 0(sp)
+    lx          s2, 8(sp)
+    lx          s3, 16(sp)
+    lx          s4, 24(sp)
+    addi        sp, sp, 32
+    ret
+endfunc
+.endm
+
+hevc_epel_v m1, m2, m4
diff --git a/libavcodec/riscv/hevcdsp_init.c b/libavcodec/riscv/hevcdsp_init.c
index 8608fdbd19..c7874996a8 100644
--- a/libavcodec/riscv/hevcdsp_init.c
+++ b/libavcodec/riscv/hevcdsp_init.c
@@ -94,6 +94,10 @@ void ff_hevc_dsp_init_riscv(HEVCDSPContext *c, const int bit_depth)
                 RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 0, 1, ff_hevc_put_epel_uni_h_8_m1_rvv);
                 RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 0, 1, ff_hevc_put_epel_uni_w_h_8_m1_rvv);
                 RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 0, 1, ff_hevc_put_epel_bi_h_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel, 1, 0, ff_hevc_put_epel_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni, 1, 0, ff_hevc_put_epel_uni_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_uni_w, 1, 0, ff_hevc_put_epel_uni_w_v_8_m1_rvv);
+                RVV_FNASSIGN_PEL(c->put_hevc_epel_bi, 1, 0, ff_hevc_put_epel_bi_v_8_m1_rvv);
                 break;
             default:
                 break;
-- 
2.25.1

_______________________________________________
ffmpeg-devel mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to