Take vector reduction out of the loop and unroll. Before: audiodsp.scalarproduct_int16_c: 12321.0 audiodsp.scalarproduct_int16_rvv_i32: 4175.7
After: audiodsp.scalarproduct_int16_c: 12320.5 audiodsp.scalarproduct_int16_rvv_i32: 1230.2 --- libavcodec/riscv/audiodsp_rvv.S | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/libavcodec/riscv/audiodsp_rvv.S b/libavcodec/riscv/audiodsp_rvv.S index af1e07bef9..f7eba2114f 100644 --- a/libavcodec/riscv/audiodsp_rvv.S +++ b/libavcodec/riscv/audiodsp_rvv.S @@ -21,21 +21,22 @@ #include "libavutil/riscv/asm.S" func ff_scalarproduct_int16_rvv, zve32x - vsetivli zero, 1, e32, m1, ta, ma - vmv.s.x v8, zero + vsetvli t0, zero, e32, m8, ta, ma + vmv.v.x v8, zero + vmv.s.x v0, zero 1: - vsetvli t0, a2, e16, m1, ta, ma + vsetvli t0, a2, e16, m4, tu, ma vle16.v v16, (a0) sub a2, a2, t0 vle16.v v24, (a1) sh1add a0, t0, a0 - vwmul.vv v0, v16, v24 + vwmacc.vv v8, v16, v24 sh1add a1, t0, a1 - vsetvli zero, t0, e32, m2, ta, ma - vredsum.vs v8, v0, v8 bnez a2, 1b - vmv.x.s a0, v8 + vsetvli t0, zero, e32, m8, ta, ma + vredsum.vs v0, v8, v0 + vmv.x.s a0, v0 ret endfunc -- 2.40.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-request@ffmpeg.org with subject "unsubscribe".