Optimize IDCT, in-loop filtering, and weighted prediction using RISC-V intrinsics. Performance was evaluated with 720p videos. Combined with the previous optimizations (chroma and luma MC), decoding FPS is 2.08x that of the scalar code, whereas the previous optimizations alone gave a 1.49x speedup.
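All kernels follow the usual RVV strip-mining pattern: set vl from the remaining element count, load, compute, store, then advance by vl. A minimal sketch of that pattern, for illustration only and not part of the patch (the helper name is hypothetical):

    #include <riscv_vector.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helper: saturating-add a DC value to n pixels,
     * strip-mined over the hardware vector length. */
    static void add_dc_u8(uint8_t *p_dst, uint8_t dc, size_t n)
    {
        while (n > 0) {
            size_t vl = __riscv_vsetvl_e8m1(n);            /* elements handled this pass */
            vuint8m1_t v = __riscv_vle8_v_u8m1(p_dst, vl); /* load */
            v = __riscv_vsaddu_vx_u8m1(v, dc, vl);         /* saturating add */
            __riscv_vse8_v_u8m1(p_dst, v, vl);             /* store */
            p_dst += vl;
            n -= vl;
        }
    }

The actual kernels below additionally widen pixels to 16 bits for the intermediate arithmetic and narrow back with vnclipu when writing the result.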
Signed-off-by: Arnie Chang <arnie.ch...@sifive.com> --- libavcodec/h264dsp.c | 2 + libavcodec/h264dsp.h | 3 +- libavcodec/riscv/Makefile | 4 + libavcodec/riscv/h264_dsp_init_riscv.c | 68 +++ libavcodec/riscv/h264_idct.c | 482 ++++++++++++++++++ libavcodec/riscv/h264_idct.h | 46 ++ libavcodec/riscv/h264_inloop.c | 669 +++++++++++++++++++++++++ libavcodec/riscv/h264_inloop.h | 47 ++ libavcodec/riscv/h264_weighted_sum.c | 273 ++++++++++ libavcodec/riscv/h264_weighted_sum.h | 47 ++ 10 files changed, 1640 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/h264_dsp_init_riscv.c create mode 100644 libavcodec/riscv/h264_idct.c create mode 100644 libavcodec/riscv/h264_idct.h create mode 100644 libavcodec/riscv/h264_inloop.c create mode 100644 libavcodec/riscv/h264_inloop.h create mode 100644 libavcodec/riscv/h264_weighted_sum.c create mode 100644 libavcodec/riscv/h264_weighted_sum.h diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index 4d2ee10bab..b6e45c15ef 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -164,5 +164,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc); #elif ARCH_LOONGARCH ff_h264dsp_init_loongarch(c, bit_depth, chroma_format_idc); +#elif ARCH_RISCV + ff_h264dsp_init_riscv(c, bit_depth, chroma_format_idc); #endif } diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index e0880c4d88..f2f8aa7e60 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -131,5 +131,6 @@ void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); - +void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); #endif /* AVCODEC_H264DSP_H */ diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 088efa3b1e..4d54bf35e9 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -24,3 +24,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264QPEL) += riscv/h264_qpel_init_riscv.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264_mc_luma.o +OBJS-$(CONFIG_H264DSP) += riscv/h264_dsp_init_riscv.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_weighted_sum.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_inloop.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_idct.o diff --git a/libavcodec/riscv/h264_dsp_init_riscv.c b/libavcodec/riscv/h264_dsp_init_riscv.c new file mode 100644 index 0000000000..7d41aa98a5 --- /dev/null +++ b/libavcodec/riscv/h264_dsp_init_riscv.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavcodec/h264dsp.h" +#include "config.h" +#include "h264_inloop.h" +#include "h264_weighted_sum.h" +#include "h264_idct.h" + +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) +{ +#if HAVE_INTRINSICS_RVV + if (bit_depth == 8) { + c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_8_rvv; + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_8_rvv; + + c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_8_rvv; + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_8_rvv; + + c->h264_v_loop_filter_chroma = h264_v_loop_filter_chroma_8_rvv; + c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_8_rvv; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma_8_rvv; + c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_8_rvv; + c->h264_h_loop_filter_chroma_mbaff_intra = h264_h_loop_filter_chroma_mbaff_intra_8_rvv; + } + + c->weight_h264_pixels_tab[0] = weight_h264_pixels_16_8_rvv; + c->weight_h264_pixels_tab[1] = weight_h264_pixels_8_8_rvv; + c->weight_h264_pixels_tab[2] = weight_h264_pixels_4_8_rvv; + + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels_16_8_rvv; + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels_8_8_rvv; + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels_4_8_rvv; + + c->h264_idct_add = h264_idct_add_8_rvv; + c->h264_idct_dc_add = h264_idct_dc_add_8_rvv; + c->h264_idct_add16 = h264_idct_add16_8_rvv; + c->h264_idct_add16intra = h264_idct_add16_intra_8_rvv; + if (chroma_format_idc <= 1) + c->h264_idct_add8 = h264_idct_add8_8_rvv; + c->h264_idct8_add = h264_idct8_add_8_rvv; + c->h264_idct8_dc_add = h264_idct8_dc_add_8_rvv; + c->h264_idct8_add4 = h264_idct8_add4_8_rvv; + } +#endif +} diff --git a/libavcodec/riscv/h264_idct.c b/libavcodec/riscv/h264_idct.c new file mode 100644 index 0000000000..3ef6b74421 --- /dev/null +++ b/libavcodec/riscv/h264_idct.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264_idct.h" +#if HAVE_INTRINSICS_RVV +#include <riscv_vector.h> + +static const uint8_t scan8[16 * 3 + 3] = +{ + 4 + 1 * 8, 5 + 1 * 8, 4 + 2 * 8, 5 + 2 * 8, + 6 + 1 * 8, 7 + 1 * 8, 6 + 2 * 8, 7 + 2 * 8, + 4 + 3 * 8, 5 + 3 * 8, 4 + 4 * 8, 5 + 4 * 8, + 6 + 3 * 8, 7 + 3 * 8, 6 + 4 * 8, 7 + 4 * 8, + 4 + 6 * 8, 5 + 6 * 8, 4 + 7 * 8, 5 + 7 * 8, + 6 + 6 * 8, 7 + 6 * 8, 6 + 7 * 8, 7 + 7 * 8, + 4 + 8 * 8, 5 + 8 * 8, 4 + 9 * 8, 5 + 9 * 8, + 6 + 8 * 8, 7 + 8 * 8, 6 + 9 * 8, 7 + 9 * 8, + 4 + 11 * 8, 5 + 11 * 8, 4 + 12 * 8, 5 + 12 * 8, + 6 + 11 * 8, 7 + 11 * 8, 6 + 12 * 8, 7 + 12 * 8, + 4 + 13 * 8, 5 + 13 * 8, 4 + 14 * 8, 5 + 14 * 8, + 6 + 13 * 8, 7 + 13 * 8, 6 + 14 * 8, 7 + 14 * 8, + 0 + 0 * 8, 0 + 5 * 8, 0 + 10 * 8 +}; + +void h264_idct_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int16_t temp[16]; + int vl = __riscv_vsetvl_e16m1(4); + + p_block[0] += 32; + + vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 4, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 8, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 12, vl); + + // 1-D row idct + vint16m1_t z0 = __riscv_vadd_vv_i16m1(row0, row2, vl); + vint16m1_t z1 = __riscv_vsub_vv_i16m1(row0, row2, vl); + vint16m1_t z2 = __riscv_vsra_vx_i16m1(row1, 1, vl); + z2 = __riscv_vsub_vv_i16m1(z2, row3, vl); + vint16m1_t z3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + z3 = __riscv_vadd_vv_i16m1(z3, row1, vl); + + vint16m1_t result0 = __riscv_vadd_vv_i16m1(z0, z3, vl); + vint16m1_t result1 = __riscv_vadd_vv_i16m1(z1, z2, vl); + vint16m1_t result2 = __riscv_vsub_vv_i16m1(z1, z2, vl); + vint16m1_t result3 = __riscv_vsub_vv_i16m1(z0, z3, vl); + + // transpose + __riscv_vse16_v_i16m1(&temp[0], result0, vl); + __riscv_vse16_v_i16m1(&temp[4], result1, vl); + __riscv_vse16_v_i16m1(&temp[8], result2, vl); + __riscv_vse16_v_i16m1(&temp[12], result3, vl); + __riscv_vlseg4e16_v_i16m1(&row0, &row1, &row2, &row3, &temp[0], vl); + + // 1-D column idct + z0 = __riscv_vadd_vv_i16m1(row0, row2, vl); + z1 = __riscv_vsub_vv_i16m1(row0, row2, vl); + z2 = __riscv_vsra_vx_i16m1(row1, 1, vl); + z2 = __riscv_vsub_vv_i16m1(z2, row3, vl); + z3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + z3 = __riscv_vadd_vv_i16m1(z3, row1, vl); + + result0 = __riscv_vadd_vv_i16m1(z0, z3, vl); + result1 = __riscv_vadd_vv_i16m1(z1, z2, vl); + result2 = __riscv_vsub_vv_i16m1(z1, z2, vl); + result3 = __riscv_vsub_vv_i16m1(z0, z3, vl); + + result0 = __riscv_vsra_vx_i16m1(result0, 6, vl); + result1 = __riscv_vsra_vx_i16m1(result1, 6, vl); + result2 = __riscv_vsra_vx_i16m1(result2, 6, vl); + result3 = __riscv_vsra_vx_i16m1(result3, 6, vl); + + vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl); + vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl); + vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl); + vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl); + + vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl)); + vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl)); + vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl)); + vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl)); + + result0 = 
__riscv_vadd_vv_i16m1(result0, dst0_w, vl); + result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl); + result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl); + result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, vl); + + result0 = __riscv_vmax_vx_i16m1(result0, 0, vl); + result1 = __riscv_vmax_vx_i16m1(result1, 0, vl); + result2 = __riscv_vmax_vx_i16m1(result2, 0, vl); + result3 = __riscv_vmax_vx_i16m1(result3, 0, vl); + + vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl); + vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl); + vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl); + vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl); + + __riscv_vse8_v_u8mf2(p_dst, result0_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl); + + memset(p_block, 0, sizeof(int16_t) * 16); +} + +void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int vl = __riscv_vsetvl_e16m1(4); + + int dc = (p_block[0] + 32) >> 6; + + if (dc > 255) + dc = 255; + + if (dc < -255) + dc = -255; + + p_block[0] = 0; + + vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst, vl); + vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst + stride, vl); + vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst + stride * 2, vl); + vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst + stride * 3, vl); + + if (dc >= 0) + { + dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl); + dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl); + dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl); + dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl); + } + else + { + dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl); + dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl); + dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl); + dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl); + } + + __riscv_vse8_v_u8m1(p_dst, dst0, vl); + __riscv_vse8_v_u8m1(p_dst + stride, dst1, vl); + __riscv_vse8_v_u8m1(p_dst + stride * 2, dst2, vl); + __riscv_vse8_v_u8m1(p_dst + stride * 3, dst3, vl); +} + +void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i++) + { + int nnz = nnzc[scan8[i]]; + + if(nnz) + { + if(nnz==1 && p_block[i*16]) + h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else + h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} + +void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i++) + { + if(nnzc[scan8[i]]) + h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else if(p_block[i*16]) + h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } +} + +void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[15*8]) +{ + for(int j = 1; j < 3; j++) + { + for(int i = j * 16; i < j * 16 + 4; i++) + { + if(nnzc[scan8[i]]) + h264_idct_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else if(p_block[i * 16]) + h264_idct_dc_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} + +void 
h264_idct8_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int16_t temp[64]; + int vl = __riscv_vsetvl_e16m1(8); + + p_block[0] += 32; + + vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 8, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 16, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 24, vl); + vint16m1_t row4 = __riscv_vle16_v_i16m1(p_block + 32, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(p_block + 40, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(p_block + 48, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(p_block + 56, vl); + + // 1-D row idct + vint16m1_t a0 = __riscv_vadd_vv_i16m1(row0, row4, vl); + vint16m1_t a2 = __riscv_vsub_vv_i16m1(row0, row4, vl); + vint16m1_t a4 = __riscv_vsra_vx_i16m1(row2, 1, vl); + a4 = __riscv_vsub_vv_i16m1(a4, row6, vl); + vint16m1_t a6 = __riscv_vsra_vx_i16m1(row6, 1, vl); + a6 = __riscv_vadd_vv_i16m1(row2, a6, vl); + + vint16m1_t b0 = __riscv_vadd_vv_i16m1(a0, a6, vl); + vint16m1_t b2 = __riscv_vadd_vv_i16m1(a2, a4, vl); + vint16m1_t b4 = __riscv_vsub_vv_i16m1(a2, a4, vl); + vint16m1_t b6 = __riscv_vsub_vv_i16m1(a0, a6, vl); + + vint16m1_t a1 = __riscv_vsra_vx_i16m1(row7, 1, vl); + a1 = __riscv_vsub_vv_i16m1(row5, a1, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row3, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row7, vl); + vint16m1_t a3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + a3 = __riscv_vsub_vv_i16m1(row7, a3, vl); + a3 = __riscv_vadd_vv_i16m1(a3, row1, vl); + a3 = __riscv_vsub_vv_i16m1(a3, row3, vl); + vint16m1_t a5 = __riscv_vsra_vx_i16m1(row5, 1, vl); + a5 = __riscv_vsub_vv_i16m1(a5, row1, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row7, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row5, vl); + vint16m1_t a7 = __riscv_vsra_vx_i16m1(row1, 1, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row3, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row5, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row1, vl); + + vint16m1_t b1 = __riscv_vsra_vx_i16m1(a7, 2, vl); + b1 = __riscv_vadd_vv_i16m1(b1, a1, vl); + vint16m1_t b3 = __riscv_vsra_vx_i16m1(a5, 2, vl); + b3 = __riscv_vadd_vv_i16m1(b3, a3, vl); + vint16m1_t b5 = __riscv_vsra_vx_i16m1(a3, 2, vl); + b5 = __riscv_vsub_vv_i16m1(b5, a5, vl); + vint16m1_t b7 = __riscv_vsra_vx_i16m1(a1, 2, vl); + b7 = __riscv_vsub_vv_i16m1(a7, b7, vl); + + vint16m1_t result0 = __riscv_vadd_vv_i16m1(b0, b7, vl); + vint16m1_t result7 = __riscv_vsub_vv_i16m1(b0, b7, vl); + vint16m1_t result1 = __riscv_vadd_vv_i16m1(b2, b5, vl); + vint16m1_t result6 = __riscv_vsub_vv_i16m1(b2, b5, vl); + vint16m1_t result2 = __riscv_vadd_vv_i16m1(b4, b3, vl); + vint16m1_t result5 = __riscv_vsub_vv_i16m1(b4, b3, vl); + vint16m1_t result3 = __riscv_vadd_vv_i16m1(b6, b1, vl); + vint16m1_t result4 = __riscv_vsub_vv_i16m1(b6, b1, vl); + + // transpose + __riscv_vse16_v_i16m1(&temp[0], result0, vl); + __riscv_vse16_v_i16m1(&temp[8], result1, vl); + __riscv_vse16_v_i16m1(&temp[16], result2, vl); + __riscv_vse16_v_i16m1(&temp[24], result3, vl); + __riscv_vse16_v_i16m1(&temp[32], result4, vl); + __riscv_vse16_v_i16m1(&temp[40], result5, vl); + __riscv_vse16_v_i16m1(&temp[48], result6, vl); + __riscv_vse16_v_i16m1(&temp[56], result7, vl); + + __riscv_vlseg8e16_v_i16m1(&row0, &row1, &row2, &row3, &row4, &row5, &row6, &row7, &temp[0], vl); + + // 1-D column idct + a0 = __riscv_vadd_vv_i16m1(row0, row4, vl); + a2 = __riscv_vsub_vv_i16m1(row0, row4, vl); + a4 = __riscv_vsra_vx_i16m1(row2, 1, vl); + a4 = __riscv_vsub_vv_i16m1(a4, row6, vl); + a6 = __riscv_vsra_vx_i16m1(row6, 1, vl); + a6 = __riscv_vadd_vv_i16m1(row2, 
a6, vl); + + b0 = __riscv_vadd_vv_i16m1(a0, a6, vl); + b2 = __riscv_vadd_vv_i16m1(a2, a4, vl); + b4 = __riscv_vsub_vv_i16m1(a2, a4, vl); + b6 = __riscv_vsub_vv_i16m1(a0, a6, vl); + + a1 = __riscv_vsra_vx_i16m1(row7, 1, vl); + a1 = __riscv_vsub_vv_i16m1(row5, a1, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row3, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row7, vl); + a3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + a3 = __riscv_vsub_vv_i16m1(row7, a3, vl); + a3 = __riscv_vadd_vv_i16m1(a3, row1, vl); + a3 = __riscv_vsub_vv_i16m1(a3, row3, vl); + a5 = __riscv_vsra_vx_i16m1(row5, 1, vl); + a5 = __riscv_vsub_vv_i16m1(a5, row1, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row7, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row5, vl); + a7 = __riscv_vsra_vx_i16m1(row1, 1, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row3, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row5, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row1, vl); + + b1 = __riscv_vsra_vx_i16m1(a7, 2, vl); + b1 = __riscv_vadd_vv_i16m1(b1, a1, vl); + b3 = __riscv_vsra_vx_i16m1(a5, 2, vl); + b3 = __riscv_vadd_vv_i16m1(b3, a3, vl); + b5 = __riscv_vsra_vx_i16m1(a3, 2, vl); + b5 = __riscv_vsub_vv_i16m1(b5, a5, vl); + b7 = __riscv_vsra_vx_i16m1(a1, 2, vl); + b7 = __riscv_vsub_vv_i16m1(a7, b7, vl); + + result0 = __riscv_vadd_vv_i16m1(b0, b7, vl); + result1 = __riscv_vadd_vv_i16m1(b2, b5, vl); + result2 = __riscv_vadd_vv_i16m1(b4, b3, vl); + result3 = __riscv_vadd_vv_i16m1(b6, b1, vl); + result4 = __riscv_vsub_vv_i16m1(b6, b1, vl); + result5 = __riscv_vsub_vv_i16m1(b4, b3, vl); + result6 = __riscv_vsub_vv_i16m1(b2, b5, vl); + result7 = __riscv_vsub_vv_i16m1(b0, b7, vl); + + // normalize and write to destination + result0 = __riscv_vsra_vx_i16m1(result0, 6, vl); + result1 = __riscv_vsra_vx_i16m1(result1, 6, vl); + result2 = __riscv_vsra_vx_i16m1(result2, 6, vl); + result3 = __riscv_vsra_vx_i16m1(result3, 6, vl); + result4 = __riscv_vsra_vx_i16m1(result4, 6, vl); + result5 = __riscv_vsra_vx_i16m1(result5, 6, vl); + result6 = __riscv_vsra_vx_i16m1(result6, 6, vl); + result7 = __riscv_vsra_vx_i16m1(result7, 6, vl); + + vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl); + vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl); + vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl); + vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl); + vuint8mf2_t dst4 = __riscv_vle8_v_u8mf2(p_dst + stride * 4, vl); + vuint8mf2_t dst5 = __riscv_vle8_v_u8mf2(p_dst + stride * 5, vl); + vuint8mf2_t dst6 = __riscv_vle8_v_u8mf2(p_dst + stride * 6, vl); + vuint8mf2_t dst7 = __riscv_vle8_v_u8mf2(p_dst + stride * 7, vl); + + vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl)); + vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl)); + vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl)); + vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl)); + vint16m1_t dst4_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst4, vl)); + vint16m1_t dst5_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst5, vl)); + vint16m1_t dst6_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst6, vl)); + vint16m1_t dst7_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst7, vl)); + + result0 = __riscv_vadd_vv_i16m1(result0, dst0_w, vl); + result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl); + result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl); + result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, 
vl); + result4 = __riscv_vadd_vv_i16m1(result4, dst4_w, vl); + result5 = __riscv_vadd_vv_i16m1(result5, dst5_w, vl); + result6 = __riscv_vadd_vv_i16m1(result6, dst6_w, vl); + result7 = __riscv_vadd_vv_i16m1(result7, dst7_w, vl); + + result0 = __riscv_vmax_vx_i16m1(result0, 0, vl); + result1 = __riscv_vmax_vx_i16m1(result1, 0, vl); + result2 = __riscv_vmax_vx_i16m1(result2, 0, vl); + result3 = __riscv_vmax_vx_i16m1(result3, 0, vl); + result4 = __riscv_vmax_vx_i16m1(result4, 0, vl); + result5 = __riscv_vmax_vx_i16m1(result5, 0, vl); + result6 = __riscv_vmax_vx_i16m1(result6, 0, vl); + result7 = __riscv_vmax_vx_i16m1(result7, 0, vl); + + vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl); + vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl); + vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl); + vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl); + vuint8mf2_t result4_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result4), 0, vl); + vuint8mf2_t result5_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result5), 0, vl); + vuint8mf2_t result6_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result6), 0, vl); + vuint8mf2_t result7_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result7), 0, vl); + + __riscv_vse8_v_u8mf2(p_dst, result0_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 4, result4_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 5, result5_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 6, result6_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 7, result7_n, vl); + + memset(p_block, 0, sizeof(int16_t) * 64); +} + +void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int count = 8; + uint8_t *p_dst_iter = p_dst; + + int dc = (p_block[0] + 32) >> 6; + + if (dc > 255) + dc = 255; + + if (dc < -255) + dc = -255; + + p_block[0] = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e16m1(8); + + vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl); + vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl); + vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl); + vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl); + vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl); + vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl); + vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl); + vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl); + + if (dc >= 0) + { + dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl); + dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl); + dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl); + dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl); + dst4 = __riscv_vsaddu_vx_u8m1(dst4, dc, vl); + dst5 = __riscv_vsaddu_vx_u8m1(dst5, dc, vl); + dst6 = __riscv_vsaddu_vx_u8m1(dst6, dc, vl); + dst7 = __riscv_vsaddu_vx_u8m1(dst7, dc, vl); + } + else + { + dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl); + dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl); + dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl); + dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl); + dst4 = __riscv_vssubu_vx_u8m1(dst4, -dc, vl); + dst5 = __riscv_vssubu_vx_u8m1(dst5, -dc, vl); + dst6 = 
__riscv_vssubu_vx_u8m1(dst6, -dc, vl); + dst7 = __riscv_vssubu_vx_u8m1(dst7, -dc, vl); + } + + __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride, dst1, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 2, dst2, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 3, dst3, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 4, dst4, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 5, dst5, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 6, dst6, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 7, dst7, vl); + + count -= vl; + p_dst_iter += vl; + } +} + +void h264_idct8_add4_8_rvv(uint8_t *p_dst, const int *p_block_offset, + int16_t *p_block, int stride, const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i += 4) + { + int nnz = nnzc[scan8[i]]; + + if(nnz) + { + if(nnz == 1 && p_block[i * 16]) + h264_idct8_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else + h264_idct8_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} +#endif + diff --git a/libavcodec/riscv/h264_idct.h b/libavcodec/riscv/h264_idct.h new file mode 100644 index 0000000000..4b942c35f7 --- /dev/null +++ b/libavcodec/riscv/h264_idct.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RISCV_H264_IDCT_H +#define AVCODEC_RISCV_H264_IDCT_H +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stddef.h> +#include "config.h" + +#if HAVE_INTRINSICS_RVV +typedef unsigned char pixel; + +void h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride); +void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride); +void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]); +void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]); +void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[15*8]); +void h264_idct8_add_8_rvv(uint8_t *_dst, int16_t *_block, int stride); +void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride); +void h264_idct8_add4_8_rvv(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[5 * 8]); +#endif +#endif \ No newline at end of file diff --git a/libavcodec/riscv/h264_inloop.c b/libavcodec/riscv/h264_inloop.c new file mode 100644 index 0000000000..d14cf4dd7a --- /dev/null +++ b/libavcodec/riscv/h264_inloop.c @@ -0,0 +1,669 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264_inloop.h" +#if HAVE_INTRINSICS_RVV +#include <riscv_vector.h> + +__attribute__((always_inline)) static void extend_tc0_2(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl) +{ + if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3]) + { + *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl); + } + else + { + const uint8_t tc02_index[] = {0, 0, 1, 1, 2, 2, 3, 3}; + vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4); + vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc02_index + start, vl); + *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl); + } +} + +__attribute__((always_inline)) static void extend_tc0(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl) +{ + if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3]) + { + *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl); + } + else + { + const uint8_t tc01_index[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4); + vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc01_index + start, vl); + *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl); + } +} + +__attribute__((always_inline)) static void luma_core(vuint8mf2_t *p_p1_dst, vuint8mf2_t *p_p0_dst, + vuint8mf2_t *p_q0_dst, vuint8mf2_t *p_q1_dst, + vuint8mf2_t p2, vuint8mf2_t p1, vuint8mf2_t p0, + vuint8mf2_t q0, vuint8mf2_t q1, vuint8mf2_t q2, + vint8mf2_t tc8, int alpha, int beta, int vl) +{ + vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl)); + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + vint16m1_t q2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl)); + + vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + vint16m1_t sub_p2_p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl); + vint16m1_t sub_q2_q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl); + + vint16m1_t minus_sub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl); + vint16m1_t minus_sub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl); + vint16m1_t minus_sub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl); + vint16m1_t minus_sub_p2_p0 = __riscv_vrsub_vx_i16m1(sub_p2_p0, 0, vl); + vint16m1_t minus_sub_q2_q0 = __riscv_vrsub_vx_i16m1(sub_q2_q0, 0, vl); + + vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, minus_sub_q0_p0, vl); + vint16m1_t abs_diff12 = 
__riscv_vmax_vv_i16m1(sub_p1_p0, minus_sub_p1_p0, vl); + vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, minus_sub_q1_q0, vl); + vint16m1_t abs_diff2 = __riscv_vmax_vv_i16m1(sub_p2_p0, minus_sub_p2_p0, vl); + vint16m1_t abs_diff3 = __riscv_vmax_vv_i16m1(sub_q2_q0, minus_sub_q2_q0, vl); + + vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl); + vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl); + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abs_diff11, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abs_diff12, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abs_diff13, beta, vl); + vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abs_diff2, beta, vl); + vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abs_diff3, beta, vl); + + vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond_mask, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond12, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl); + cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl); + cond3 = __riscv_vmand_mm_b16(cond3, cond1, vl); + + // p1 + vint16m1_t sum_p0_q0 = __riscv_vaadd_vv_i16m1(p0_i16, q0_i16, vl); + vint16m1_t p1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, p2_i16, vl); + p1_new_i16 = __riscv_vsra_vx_i16m1(p1_new_i16, 1, vl); + vint16m1_t p1_new_upper = __riscv_vadd_vv_i16m1(p1_i16, tc, vl); + vint16m1_t p1_new_lower = __riscv_vsub_vv_i16m1(p1_i16, tc, vl); + p1_new_i16 = __riscv_vmax_vv_i16m1(p1_new_i16, p1_new_lower, vl); + p1_new_i16 = __riscv_vmin_vv_i16m1(p1_new_i16, p1_new_upper, vl); + *p_p1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond2, p1, __riscv_vreinterpret_v_i16m1_u16m1(p1_new_i16), vl); + vint16m1_t tc_adjust = __riscv_vadc_vxm_i16m1(tc, 0, cond2, vl); + + // q1 + vint16m1_t q1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, q2_i16, vl); + q1_new_i16 = __riscv_vsra_vx_i16m1(q1_new_i16, 1, vl); + vint16m1_t q1_new_upper = __riscv_vadd_vv_i16m1(q1_i16, tc, vl); + vint16m1_t q1_new_lower = __riscv_vsub_vv_i16m1(q1_i16, tc, vl); + q1_new_i16 = __riscv_vmax_vv_i16m1(q1_new_i16, q1_new_lower, vl); + q1_new_i16 = __riscv_vmin_vv_i16m1(q1_new_i16, q1_new_upper, vl); + *p_q1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond3, q1, __riscv_vreinterpret_v_i16m1_u16m1(q1_new_i16), vl); + tc_adjust = __riscv_vadc_vxm_i16m1(tc_adjust, 0, cond3, vl); + + // p0, q0 + vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl); + vint16m1_t delta_i16 = __riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl); + delta_i16 = __riscv_vadd_vv_i16m1(delta_i16, sub_p1_q1, vl); + delta_i16 = __riscv_vssra_vx_i16m1(delta_i16, 3, vl); + delta_i16 = __riscv_vmin_vv_i16m1(delta_i16, tc_adjust, vl); + delta_i16 = __riscv_vmax_vv_i16m1(delta_i16, __riscv_vrsub_vx_i16m1(tc_adjust, 0, vl), vl); + + vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta_i16, vl); + vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta_i16, vl); + p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl); + q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl); + + *p_p0_dst= __riscv_vnclipu_wx_u8mf2_mu(cond1, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond1, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + 
vint8mf2_t tc8; + extend_tc0(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl); + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl); + + vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst; + luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p2, p1, p0, q0, q1, q2; + __riscv_vlsseg6e8_v_u8mf2(&p2, &p1, &p0, &q0, &q1, &q2, p_iter - 3, stride, width); + + vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst; + luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl); + + __riscv_vssseg4e8_v_u8mf2(p_iter - 2, stride, p1_dst, p0_dst, q0_dst, q1_dst, 16); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void chroma_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t p1, vuint8mf2_t p0, vuint8mf2_t q0, + vuint8mf2_t q1, vint8mf2_t tc8, int alpha, + int beta, int vl) +{ + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + + vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + + vint16m1_t rsub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl); + vint16m1_t rsub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl); + vint16m1_t rsub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl); + + vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, rsub_q0_p0, vl); + vint16m1_t abs_diff12 = __riscv_vmax_vv_i16m1(sub_p1_p0, rsub_p1_p0, vl); + vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, rsub_q1_q0, vl); + + vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl); + vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl); + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16_mu(cond_mask, cond_mask, abs_diff11, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abs_diff12, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abs_diff13, beta, vl); + + vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl); + vint16m1_t delta = 
__riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl); + delta = __riscv_vadd_vv_i16m1(delta, sub_p1_q1, vl); + delta = __riscv_vssra_vx_i16m1(delta, 3, vl); + delta = __riscv_vmin_vv_i16m1(delta, tc, vl); + delta = __riscv_vmax_vv_i16m1(delta, __riscv_vrsub_vx_i16m1(tc, 0, vl), vl); + + vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta, vl); + vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta, vl); + p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl); + q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl); + + *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0_2(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + + count -= vl; + tc_offset += vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0_2(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void luma_intra_core(vuint8mf2_t *p_p2_dst, vuint8mf2_t *p_p1_dst, + vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t *p_q1_dst, vuint8mf2_t *p_q2_dst, + vuint8mf2_t p3, vuint8mf2_t p2, vuint8mf2_t p1, + vuint8mf2_t p0, vuint8mf2_t q0, vuint8mf2_t q1, + vuint8mf2_t q2, vuint8mf2_t q3, int alpha, + int beta, int vl) +{ + vint16m1_t p3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p3, 0, vl)); + vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl)); + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + vint16m1_t q2_i16 = 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl)); + vint16m1_t q3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q3, 0, vl)); + + // p0 + vint16m1_t sum_p1p0q0 = __riscv_vadd_vv_i16m1(p0_i16, p1_i16, vl); + sum_p1p0q0 = __riscv_vadd_vv_i16m1(sum_p1p0q0, q0_i16, vl); + + vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl); + vint16m1_t p0_new2_i16 = __riscv_vadd_vv_i16m1(p2_i16, q1_i16, vl); + + // p1 + vint16m1_t p1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p1p0q0, p2_i16, vl); + + // q0 + vint16m1_t sum_p0q0q1 = __riscv_vadd_vv_i16m1(p0_i16, q0_i16, vl); + sum_p0q0q1 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q1_i16, vl); + + vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl); + vint16m1_t q0_new2_i16 = __riscv_vadd_vv_i16m1(q2_i16, p1_i16, vl); + + // q1 + vint16m1_t q1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q2_i16, vl); + + p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl); + p0_new2_i16 = __riscv_vmacc_vx_i16m1(p0_new2_i16, 2, sum_p1p0q0, vl); + vint16m1_t p2_new1_i16 = __riscv_vmadd_vx_i16m1(p3_i16, 2, sum_p1p0q0, vl); + p2_new1_i16 = __riscv_vmacc_vx_i16m1(p2_new1_i16, 3, p2_i16, vl); + q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl); + q0_new2_i16 = __riscv_vmacc_vx_i16m1(q0_new2_i16, 2, sum_p0q0q1, vl); + vint16m1_t q2_new1_i16 = __riscv_vmadd_vx_i16m1(q3_i16, 2, sum_p0q0q1, vl); + q2_new1_i16 = __riscv_vmacc_vx_i16m1(q2_new1_i16, 3, q2_i16, vl); + + vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + vint16m1_t sub_p2p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl); + vint16m1_t sub_q2q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl); + + vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl); + vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl); + vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl); + vint16m1_t rsub_p2p0 = __riscv_vrsub_vx_i16m1(sub_p2p0, 0, vl); + vint16m1_t rsub_q2q0 = __riscv_vrsub_vx_i16m1(sub_q2q0, 0, vl); + + vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(rsub_q0p0, sub_q0p0, vl); + vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(rsub_p1p0, sub_p1p0, vl); + vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(rsub_q1q0, sub_q1q0, vl); + vint16m1_t abd_p2p0 = __riscv_vmax_vv_i16m1(rsub_p2p0, sub_p2p0, vl); + vint16m1_t abd_q2q0 = __riscv_vmax_vv_i16m1(rsub_q2q0, sub_q2q0, vl); + + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abd_p1p0_, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abd_q1q0, beta, vl); + vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, (alpha >> 2) + 2, vl); + vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abd_p2p0, beta, vl); + vbool16_t cond4 = __riscv_vmslt_vx_i16m1_b16(abd_q2q0, beta, vl); + + vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond12, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl); + cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl); + cond3 = __riscv_vmand_mm_b16(cond3, cond2, vl); + cond4 = __riscv_vmand_mm_b16(cond4, cond2, vl); + + vuint8mf2_t p0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl); + vuint8mf2_t p0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new2_i16), 3, vl); + vuint8mf2_t p1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p1_new1_i16), 2, vl); + vuint8mf2_t p2_new1_u8 = 
__riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p2_new1_i16), 3, vl); + vuint8mf2_t q0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl); + vuint8mf2_t q0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new2_i16), 3, vl); + vuint8mf2_t q1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q1_new1_i16), 2, vl); + vuint8mf2_t q2_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q2_new1_i16), 3, vl); + + *p_p1_dst = __riscv_vmerge_vvm_u8mf2(p1, p1_new1_u8, cond3, vl); + *p_p2_dst = __riscv_vmerge_vvm_u8mf2(p2, p2_new1_u8, cond3, vl); + *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0_new1_u8, p0_new2_u8, cond3, vl); + *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0, *p_p0_dst, cond1, vl); + + *p_q0_dst = __riscv_vmerge_vvm_u8mf2(q0, q0_new1_u8, cond1, vl); + *p_q0_dst = __riscv_vmerge_vvm_u8mf2(*p_q0_dst, q0_new2_u8, cond4, vl); + *p_q1_dst = __riscv_vmerge_vvm_u8mf2(q1, q1_new1_u8, cond4, vl); + *p_q2_dst = __riscv_vmerge_vvm_u8mf2(q2, q2_new1_u8, cond4, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p3 = __riscv_vle8_v_u8mf2(p_iter - 4 * stride, vl); + vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl); + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl); + vuint8mf2_t q3 = __riscv_vle8_v_u8mf2(p_iter + 3 * stride, vl); + + vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst; + + luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst, + p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride * 3, p2_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride * 2, q2_dst, vl); + + count -= vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p3, p2, p1, p0, q0, q1, q2, q3; + __riscv_vlsseg8e8_v_u8mf2(&p3, &p2, &p1, &p0, + &q0, &q1, &q2, &q3, p_iter - 4, stride, 16); + + vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst; + + luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst, + p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl); + + __riscv_vssseg6e8_v_u8mf2(p_iter - 3, stride, + p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst, 16); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void chroma_intra_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t p1, vuint8mf2_t p0, + vuint8mf2_t 
q0, vuint8mf2_t q1, + int alpha, int beta, int vl) +{ + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + + vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + + vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl); + vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl); + vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl); + + vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(sub_q0p0, rsub_q0p0, vl); + vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(sub_p1p0, rsub_p1p0, vl); + vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(sub_q1q0, rsub_q1q0, vl); + + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abd_p1p0_, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abd_q1q0, beta, vl); + + vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl); + vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl); + p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl); + q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl); + + *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma_mbaff_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + 
uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(count); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + v_loop_filter_luma(pix, stride, 16, alpha, beta, p_tc0); +} + +void h264_h_loop_filter_luma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + h_loop_filter_luma(p_pix, stride, 16, alpha, beta, p_tc0); +} + +void h264_v_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + v_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0); +} + +void h264_h_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + h_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0); +} + +void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + v_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta); +} + +void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta); +} + +void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + v_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta); +} + +void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta); +} + +void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_chroma_mbaff_intra(p_pix, stride, 4, alpha, beta); +} +#endif diff --git a/libavcodec/riscv/h264_inloop.h b/libavcodec/riscv/h264_inloop.h new file mode 100644 index 0000000000..3c60e45395 --- /dev/null +++ b/libavcodec/riscv/h264_inloop.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RISCV_H264_INLOOP_H +#define AVCODEC_RISCV_H264_INLOOP_H +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stddef.h> +#include "config.h" + +#if HAVE_INTRINSICS_RVV +typedef unsigned char pixel; + +void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +void h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); + +void h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +void h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); + +void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); + +void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); + +void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +#endif +#endif \ No newline at end of file diff --git a/libavcodec/riscv/h264_weighted_sum.c b/libavcodec/riscv/h264_weighted_sum.c new file mode 100644 index 0000000000..0ba57d0acc --- /dev/null +++ b/libavcodec/riscv/h264_weighted_sum.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+__attribute__((always_inline)) static void h264_weight_128(uint8_t *p_block, ptrdiff_t stride, int width,
+                                                           int height, int log2_den, int offset)
+{
+    uint8_t *p_block_iter = p_block;
+
+    short value = (unsigned)offset << log2_den;
+
+    if (log2_den)
+        value += (1 << (log2_den - 1));
+
+    int shift = log2_den;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_begin = p_block_iter;
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row0, vl));
+            result1_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row1, vl));
+
+            /* weight == 128 is a plain left shift by 7 */
+            result0_w = __riscv_vsll_vx_i16m2(result0_w, 7, vl);
+            result1_w = __riscv_vsll_vx_i16m2(result1_w, 7, vl);
+
+            /* add the pre-shifted offset and rounding bias, saturating to avoid 16-bit overflow */
+            result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+            result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+            p_block_iter += stride;
+            __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+            p_block_iter += stride;
+        }
+
+        p_block_iter = p_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
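+/*
+ * General case: widen each row with a signed-by-unsigned multiply against the
+ * broadcast weight, add the pre-shifted offset with saturation, clamp at zero
+ * and narrow back to 8 bits with a clipping right shift of log2_den.
+ */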
+__attribute__((always_inline)) static void h264_weight_normal(uint8_t *p_block, ptrdiff_t stride,
+                                                              int width, int height, int log2_den,
+                                                              int weight, int offset)
+{
+    uint8_t *p_block_iter = p_block;
+
+    short value = (unsigned)offset << log2_den;
+
+    if (log2_den)
+        value += (1 << (log2_den - 1));
+
+    int shift = log2_den;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_begin = p_block_iter;
+
+        vint8m1_t weight_v = __riscv_vmv_v_x_i8m1(weight, vl);
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vwmulsu_vv_i16m2(weight_v, row0, vl);
+            result1_w = __riscv_vwmulsu_vv_i16m2(weight_v, row1, vl);
+
+            result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+            result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+            p_block_iter += stride;
+            __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+            p_block_iter += stride;
+        }
+
+        p_block_iter = p_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
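+/*
+ * Bidirectional weighting:
+ *     out = clip8((dst * weightd + src * weights + (((offset + 1) | 1) << log2_den)) >> (log2_den + 1))
+ * Both widening multiply-accumulates target a 16-bit accumulator that is
+ * pre-loaded with the combined rounding/offset constant.
+ */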
+__attribute__((always_inline)) static void h264_biweight(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                                          int width, int height, int log2_den,
+                                                          int weightd, int weights, int offset)
+{
+    uint8_t *p_dst_iter = p_dst;
+    uint8_t *p_src_iter = p_src;
+    short value = (unsigned int)((offset + 1) | 1) << log2_den;
+    int shift = log2_den + 1;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vmv_v_x_i16m2(value, vl);
+            result1_w = __riscv_vmv_v_x_i16m2(value, vl);
+
+            result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weightd, dst0, vl);
+            result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weightd, dst1, vl);
+
+            result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weights, src0, vl);
+            result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weights, src1, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, result0_n, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, result1_n, vl);
+            p_dst_iter += stride;
+        }
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                 int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 16, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 16, height, log2_den, weight, offset);
+    }
+}
+
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 8, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 8, height, log2_den, weight, offset);
+    }
+}
+
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 4, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 4, height, log2_den, weight, offset);
+    }
+}
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                   int height, int log2_den, int weightd,
+                                   int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 16, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd,
+                                  int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 8, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd,
+                                  int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 4, height, log2_den, weightd, weights, offset);
+}
+#endif
diff --git a/libavcodec/riscv/h264_weighted_sum.h b/libavcodec/riscv/h264_weighted_sum.h
new file mode 100644
index 0000000000..631d6df1fa
--- /dev/null
+++ b/libavcodec/riscv/h264_weighted_sum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#define AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                 int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset);
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                   int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd, int weights, int offset);
+#endif
+#endif
\ No newline at end of file
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".