PR #21048 opened by Rémi Denis-Courmont (Courmisch) URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048 Patch URL: https://code.ffmpeg.org/FFmpeg/FFmpeg/pulls/21048.patch
From d81b88782e181bdee9599e0fac1ca62915dfb723 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <[email protected]> Date: Sat, 29 Nov 2025 17:46:55 +0200 Subject: [PATCH 1/2] lavc/h264idct: R-V V 8-bit h264_luma_dc_dequant_idct This does not improve performance with current hardware due to the poor performance of segmented accesses. Performance should be slightly better with expensive or near-future hardware that I don't have, however it is still limited by two other factors: - There are only 4 elements. - The final stores are necessarily indexed and hit multiple cache lines, thus as slow as scalar. --- libavcodec/riscv/Makefile | 2 +- libavcodec/riscv/h264dsp_init.c | 7 +- libavcodec/riscv/h264idct_dequant_rvv.S | 86 +++++++++++++++++++++++++ 3 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 libavcodec/riscv/h264idct_dequant_rvv.S diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 736f873fe8..3d2a2b4b6f 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -32,7 +32,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264DSP) += riscv/h264dsp_init.o RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264addpx_rvv.o riscv/h264dsp_rvv.o \ - riscv/h264idct_rvv.o + riscv/h264idct_rvv.o riscv/h264idct_dequant_rvv.o OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_init.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264qpel_rvv.o OBJS-$(CONFIG_HEVC_DECODER) += riscv/hevcdsp_init.o diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index f214486bbe..7ab8d38698 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -80,7 +80,8 @@ void ff_h264_idct4_add8_##depth##_rvv(uint8_t **d, const int *soffset, \ const uint8_t nnzc[5 * 8]); \ void ff_h264_idct4_add8_422_##depth##_rvv(uint8_t **d, const int *soffset, \ int16_t *s, int stride, \ - const uint8_t nnzc[5 * 8]); + const uint8_t 
nnzc[5 * 8]); \ +void ff_h264_luma_dc_dequant_idct_##depth##_rvv(int16_t *d, int16_t *s, int q); IDCT_DEPTH(8) IDCT_DEPTH(9) @@ -174,6 +175,10 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, dsp->h264_idct_add8 = ff_h264_idct4_add8_422_8_rvv; # endif } + + dsp->h264_luma_dc_dequant_idct = + ff_h264_luma_dc_dequant_idct_8_rvv; + if (flags & AV_CPU_FLAG_RVV_I64) { dsp->h264_add_pixels8_clear = ff_h264_add_pixels8_8_rvv; if (flags & AV_CPU_FLAG_RVB) diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S new file mode 100644 index 0000000000..73a68a28ab --- /dev/null +++ b/libavcodec/riscv/h264idct_dequant_rvv.S @@ -0,0 +1,86 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright © 2025 Rémi Denis-Courmont. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "libavutil/riscv/asm.S" + +const offsets_8, 1 # index vector for the four indexed stores at the end of the function below + .short 0, 64, 256, 320 # loaded with vle16.v and used as vsuxei16.v offsets (byte offsets per the RVV spec) +endconst + +func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x # declared in h264dsp_init.c as (int16_t *d, int16_t *s, int q); the code reads from (a1), multiplies by a2 and stores relative to a0 + lpad 0 + csrwi vxrm, 0 # fixed-point rounding mode 0, round-to-nearest-up per the RVV spec; applies to vnclip below + vsetivli zero, 4, e16, mf2, ta, ma # 4 elements of 16 bits + vlseg4e16.v v8, (a1) # segment load of 16 coefficients: v8, v9, v10, v11 receive fields 0, 1, 2, 3 of each group of 4 + vwadd.vv v16, v8, v9 # z0: sum of fields 0 and 1, widened to 32 bits + addi t1, sp, 4 * 4 * -3 # t1, t2, t3: second to fourth 16-byte rows of the 64-byte scratch area reserved below + vwadd.vv v19, v10, v11 # z3: sum of fields 2 and 3 + addi t2, sp, 4 * 4 * -2 + vwsub.vv v17, v8, v9 # z1: field 0 minus field 1 + addi t3, sp, 4 * 4 * -1 + vwsub.vv v18, v10, v11 # z2: field 2 minus field 3 + vsetvli zero, zero, e32, m1, ta, ma # vl unchanged, now operating on the widened 32-bit values + vadd.vv v8, v16, v19 # z0 plus z3 + addi sp, sp, 4 * 4 * -4 # reserve 64 bytes of stack scratch + vsub.vv v9, v16, v19 # z0 minus z3 + vsub.vv v10, v17, v18 # z1 minus z2 + vadd.vv v11, v17, v18 # z1 plus z2 + vsseg4e32.v v8, (sp) # interleaving store of v8..v11; with the four row loads below this transposes the 4x4 block through memory + vle32.v v8, (sp) + vle32.v v9, (t1) + vle32.v v10, (t2) + vle32.v v11, (t3) + vadd.vv v16, v8, v10 # z0 of the second pass: row 0 plus row 2 + addi sp, sp, 4 * 4 * 4 # release the scratch area; all loads from it are above + vadd.vv v19, v9, v11 # z3: row 1 plus row 3 + lla t0, offsets_8 + vsub.vv v17, v8, v10 # z1: row 0 minus row 2 + vsub.vv v18, v9, v11 # z2: row 1 minus row 3 + vadd.vv v8, v16, v19 # z0 plus z3 + vadd.vv v9, v17, v18 # z1 plus z2 + vsub.vv v10, v17, v18 # z1 minus z2 + vsub.vv v11, v16, v19 # z0 minus z3 + vle16.v v24, (t0) # v24: store offsets from offsets_8 + vmul.vx v8, v8, a2 # scale all four result vectors by the scalar in a2 + vmul.vx v9, v9, a2 + vmul.vx v10, v10, a2 + vmul.vx v11, v11, a2 + vsetvli zero, zero, e16, mf2, ta, ma # back to 16-bit elements for the narrowing below + vnclip.wi v16, v8, 8 # narrow to 16 bits: shift right by 8 with vxrm rounding and signed saturation + addi t1, a0, 2 * 16 * 1 # store bases for v17, v18, v19: a0 plus 32, 128 and 160 bytes + vnclip.wi v17, v9, 8 + addi t2, a0, 2 * 16 * 4 + vnclip.wi v18, v10, 8 + addi t3, a0, 2 * 16 * 5 + vnclip.wi v19, v11, 8 + vsuxei16.v v16, (a0), v24 # scatter the 4 elements of each result vector to its base plus the offsets in v24 + vsuxei16.v v17, (t1), v24 + vsuxei16.v v18, (t2), v24 + vsuxei16.v v19, (t3), v24 + ret +endfunc -- 2.49.1 From 9e42a42b229365250316774269948835e736020d Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= <[email protected]> Date: Sat, 29 Nov 2025 22:51:01 +0200 Subject: [PATCH 2/2] lavc/h264idct: R-V V 9-bit h264_luma_dc_dequant_idct Note that, like the C reference, the same function can be used for larger bit depths. --- libavcodec/riscv/h264dsp_init.c | 5 ++- libavcodec/riscv/h264idct_dequant_rvv.S | 55 +++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/libavcodec/riscv/h264dsp_init.c b/libavcodec/riscv/h264dsp_init.c index 7ab8d38698..06cb3c59de 100644 --- a/libavcodec/riscv/h264dsp_init.c +++ b/libavcodec/riscv/h264dsp_init.c @@ -189,8 +189,11 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth, #define IDCT_DEPTH(depth) \ if (bit_depth == depth) { \ - if (zvl128b) \ + if (zvl128b) { \ dsp->h264_idct_add = ff_h264_idct_add_##depth##_rvv; \ + dsp->h264_luma_dc_dequant_idct = \ + ff_h264_luma_dc_dequant_idct_9_rvv; \ + } \ if (flags & AV_CPU_FLAG_RVB) \ dsp->h264_idct8_add = ff_h264_idct8_add_##depth##_rvv; \ if (zvl128b && (flags & AV_CPU_FLAG_RVB)) { \ diff --git a/libavcodec/riscv/h264idct_dequant_rvv.S b/libavcodec/riscv/h264idct_dequant_rvv.S index 73a68a28ab..bc49ca6ad4 100644 --- a/libavcodec/riscv/h264idct_dequant_rvv.S +++ b/libavcodec/riscv/h264idct_dequant_rvv.S @@ -84,3 +84,58 @@ func ff_h264_luma_dc_dequant_idct_8_rvv, zve32x vsuxei16.v v19, (t3), v24 ret endfunc + +const offsets_9, 1 # index vector for the four indexed stores at the end of the function below + .short 0, 128, 512, 640 # twice the offsets_8 values; loaded with vle16.v and used as vsuxei16.v offsets (byte offsets per the RVV spec) +endconst + +func ff_h264_luma_dc_dequant_idct_9_rvv, zve32x # NOTE(review): coefficients are read and written as 32-bit elements here although the shared prototype in h264dsp_init.c declares int16_t pointers; presumably the high bit depth coefficient layout, to be confirmed + lpad 0 + csrwi vxrm, 0 # fixed-point rounding mode 0, round-to-nearest-up per the RVV spec; applies to vssra below + vsetivli zero, 4, e32, m1, ta, ma # 4 elements of 32 bits, kept for the whole function + vlseg4e32.v v8, (a1) # segment load of 16 coefficients: v8, v9, v10, v11 receive fields 0, 1, 2, 3 of each group of 4 + vadd.vv v16, v8, v9 # z0: sum of fields 0 and 1 + addi t1, sp, 4 * 4 * -3 # t1, t2, t3: second to fourth 16-byte rows of the 64-byte scratch area reserved below + vadd.vv v19, v10, v11 # z3: sum of fields 2 and 3 + addi t2, sp, 4 * 4 * -2 + vsub.vv v17, v8, v9 # z1: field 0 minus field 1 + addi t3, sp, 4 * 4 * -1 + vsub.vv v18, v10, v11 # z2: field 2 minus field 3 + vadd.vv v8, v16, v19 # z0 plus z3 + addi sp, sp, 4 * 4 * -4 # reserve 64 bytes of stack scratch + vsub.vv v9, v16, v19 # z0 minus z3 + vsub.vv v10, v17, v18 # z1 minus z2 + vadd.vv v11, v17, v18 # z1 plus z2 + vsseg4e32.v v8, (sp) # interleaving store of v8..v11; with the four row loads below this transposes the 4x4 block through memory + vle32.v v8, (sp) + vle32.v v9, (t1) + vle32.v
v10, (t2) + vle32.v v11, (t3) + vadd.vv v16, v8, v10 # z0 of the second pass: row 0 plus row 2 + addi sp, sp, 4 * 4 * 4 # release the scratch area; all loads from it are above + vadd.vv v19, v9, v11 # z3: row 1 plus row 3 + lla t0, offsets_9 + vsub.vv v17, v8, v10 # z1: row 0 minus row 2 + vsub.vv v18, v9, v11 # z2: row 1 minus row 3 + vadd.vv v8, v16, v19 # z0 plus z3 + vadd.vv v9, v17, v18 # z1 plus z2 + vsub.vv v10, v17, v18 # z1 minus z2 + vsub.vv v11, v16, v19 # z0 minus z3 + vle16.v v24, (t0) # v24: store offsets from offsets_9 + vmul.vx v8, v8, a2 # scale all four result vectors by the scalar in a2 + vmul.vx v9, v9, a2 + vmul.vx v10, v10, a2 + vmul.vx v11, v11, a2 + vssra.vi v16, v8, 8 # arithmetic shift right by 8 with vxrm rounding; results stay 32-bit + addi t1, a0, 4 * 16 * 1 # store bases for v17, v18, v19: a0 plus 64, 256 and 320 bytes + vssra.vi v17, v9, 8 + addi t2, a0, 4 * 16 * 4 + vssra.vi v18, v10, 8 + addi t3, a0, 4 * 16 * 5 + vssra.vi v19, v11, 8 + vsuxei16.v v16, (a0), v24 # scatter the 4 elements of each result vector to its base plus the offsets in v24 + vsuxei16.v v17, (t1), v24 + vsuxei16.v v18, (t2), v24 + vsuxei16.v v19, (t3), v24 + ret +endfunc -- 2.49.1 _______________________________________________ ffmpeg-devel mailing list -- [email protected] To unsubscribe send an email to [email protected]
