ffmpeg | branch: master | Rémi Denis-Courmont <r...@remlab.net> | Sun Sep 1 15:47:26 2024 +0300| [4936bb25083ebdd7b0c514ab8b81159bb4273265] | committer: Rémi Denis-Courmont
lavc/h264dsp: optimise R-V V weight for shorter heights The height is a power of two of up to 16 rows. The current code was optimised for large sample counts. T-Head C908: h264_weight2_8_c: 211.7 ( 1.00x) h264_weight2_8_rvv_i32: before 184.0 ( 1.15x) h264_weight2_8_rvv_i32: after 54.2 ( 3.90x) h264_weight4_8_c: 285.7 ( 1.00x) h264_weight4_8_rvv_i32: before 341.2 ( 0.86x) h264_weight4_8_rvv_i32: after 82.2 ( 3.47x) h264_weight8_8_c: 498.7 ( 1.00x) h264_weight8_8_rvv_i32: before 683.7 ( 0.73x) h264_weight8_8_rvv_i64: after 128.5 ( 3.95x) h264_weight16_8_c: 878.2 ( 1.00x) h264_weight16_8_rvv_i32: unchanged 239.5 ( 3.67x) SpacemiT X60: h264_weight2_8_c: 207.2 ( 1.00x) h264_weight2_8_rvv_i32: before 259.6 ( 0.80x) h264_weight2_8_rvv_i32: after 82.2 ( 2.52x) h264_weight4_8_c: 290.8 ( 1.00x) h264_weight4_8_rvv_i32: before 509.6 ( 0.57x) h264_weight4_8_rvv_i32: after 61.5 ( 4.73x) h264_weight8_8_c: 498.8 ( 1.00x) h264_weight8_8_rvv_i32: before 1019.8 ( 0.49x) h264_weight8_8_rvv_i64: after 71.8 ( 6.95x) h264_weight16_8_c: 874.0 ( 1.00x) h264_weight16_8_rvv_i32: unchanged 249.0 ( 3.51x) > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=4936bb25083ebdd7b0c514ab8b81159bb4273265 --- libavcodec/riscv/h264dsp_init.c | 18 +++++++++--- libavcodec/riscv/h264dsp_rvv.S | 62 +++++++++++++++++++---------------------- 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index 9ffc9b0333..6391667a40 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -96,13 +96,23 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, if (flags & AV_CPU_FLAG_RVV_I32) { const bool zvl128b = ff_rv_vlen_least(128); + if (bit_depth == 8) { + if (zvl128b) + dsp->weight_h264_pixels_tab[0] = + ff_h264_weight_funcs_8_rvv[0].weight; + if (flags & AV_CPU_FLAG_RVV_I64) + dsp->weight_h264_pixels_tab[1] = + ff_h264_weight_funcs_8_rvv[1].weight; + dsp->weight_h264_pixels_tab[2] = + ff_h264_weight_funcs_8_rvv[2].weight; + dsp->weight_h264_pixels_tab[3] = + ff_h264_weight_funcs_8_rvv[3].weight; + } + if (bit_depth == 8 && zvl128b) { - for (int i = 0; i < 4; i++) { - dsp->weight_h264_pixels_tab[i] = - ff_h264_weight_funcs_8_rvv[i].weight; + for (int i = 0; i < 4; i++) dsp->biweight_h264_pixels_tab[i] = ff_h264_weight_funcs_8_rvv[i].biweight; - } dsp->h264_v_loop_filter_luma = ff_h264_v_loop_filter_luma_8_rvv; dsp->h264_h_loop_filter_luma = ff_h264_h_loop_filter_luma_8_rvv; diff --git a/libavcodec/riscv/h264dsp_rvv.S b/libavcodec/riscv/h264dsp_rvv.S index 422ac02222..b081e156a0 100644 --- a/libavcodec/riscv/h264dsp_rvv.S +++ b/libavcodec/riscv/h264dsp_rvv.S @@ -28,11 +28,12 @@ #include "libavutil/riscv/asm.S" + .variant_cc ff_h264_weight_pixels_simple_8_rvv func ff_h264_weight_pixels_simple_8_rvv, zve32x csrwi vxrm, 0 sll a5, a5, a3 1: - vsetvli zero, a6, e16, m2, ta, ma + vsetvli zero, t6, e16, m2, ta, ma vle8.v v8, (a0) addi a2, a2, -1 vzext.vf2 v24, v8 @@ -76,38 +77,36 @@ func ff_h264_biweight_pixels_simple_8_rvv, zve32x ret endfunc -func ff_h264_weight_pixels_8_rvv, zve32x +.macro h264_weight depth, w, b= +func ff_h264_weight_pixels\w\()_\depth\()_rvv, zve64x + lpad 0 + .ifb \b + li t6, \w + j ff_h264_weight_pixels_simple_\depth\()_rvv + .else csrwi vxrm, 0 sll a5, a5, a3 1: - mv t0, a0 - mv t6, a6 -2: - vsetvli t2, a2, e16, m8, ta, ma - vlsseg2e8.v v0, (t0), a1 - addi t6, t6, -2 - vzext.vf2 v16, v0 - vzext.vf2 v24, v4 - vmul.vx v16, v16, a4 - vmul.vx v24, v24, a4 + vsetvli t1, a2, e\b, m2, ta, ma + vlse\b\().v v8, (a0), a1 + vsetvli t0, zero, e16, m4, ta, ma + vzext.vf2 v24, v8 + sub a2, a2, t1 + vmul.vx v16, v24, a4 + mul t2, t1, a1 vsadd.vx v16, v16, a5 - vsadd.vx v24, v24, a5 vmax.vx v16, v16, zero - vmax.vx v24, v24, zero - vsetvli zero, zero, e8, m4, ta, ma - vnclipu.wx v0, v16, a3 - vnclipu.wx v4, v24, a3 - vssseg2e8.v v0, (t0), a1 - addi t0, t0, 2 - bnez t6, 2b - - mul t3, a1, t2 - sub a2, a2, t2 - add a0, a0, t3 + vsetvli zero, zero, e8, m2, ta, ma + vnclipu.wx v8, v16, a3 + vsetvli zero, t1, e\b, m2, ta, ma + vsse\b\().v v8, (a0), a1 + add a0, a0, t2 bnez a2, 1b ret + .endif endfunc +.endm .variant_cc ff_h264_biweight_pixels_8_rvv func ff_h264_biweight_pixels_8_rvv, zve32x @@ -152,17 +151,12 @@ func ff_h264_biweight_pixels_8_rvv, zve32x ret endfunc -.irp w, 16, 8, 4, 2 -func ff_h264_weight_pixels\w\()_8_rvv, zve32x - lpad 0 - li a6, \w - .if \w == 16 - j ff_h264_weight_pixels_simple_8_rvv - .else - j ff_h264_weight_pixels_8_rvv - .endif -endfunc +h264_weight 8, 2, 16 +h264_weight 8, 4, 32 +h264_weight 8, 8, 64 +h264_weight 8, 16 +.irp w, 16, 8, 4, 2 func ff_h264_biweight_pixels\w\()_8_rvv, zve32x lpad 0 li t6, \w _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".