Optimize IDCT, in-loop filtering, and weighted prediction using RISC-V intrinsics. Performance was evaluated with 720p videos. Combined with the previous optimizations (chroma and luma MC), decoding FPS is 2.08x that of the scalar code, whereas the previous optimizations alone gave a 1.49x speedup.
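All kernels follow the usual RVV strip-mining pattern: set vl from the remaining element count, load, compute, store, then advance by vl. A minimal sketch of that pattern, for illustration only and not part of the patch (the helper name is hypothetical):

    #include <riscv_vector.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical helper: saturating-add a DC value to n pixels,
     * strip-mined over the hardware vector length. */
    static void add_dc_u8(uint8_t *p_dst, uint8_t dc, size_t n)
    {
        while (n > 0) {
            size_t vl = __riscv_vsetvl_e8m1(n);            /* elements handled this pass */
            vuint8m1_t v = __riscv_vle8_v_u8m1(p_dst, vl); /* load */
            v = __riscv_vsaddu_vx_u8m1(v, dc, vl);         /* saturating add */
            __riscv_vse8_v_u8m1(p_dst, v, vl);             /* store */
            p_dst += vl;
            n -= vl;
        }
    }

The actual kernels below additionally widen pixels to 16 bits for the intermediate arithmetic and narrow back with vnclipu when writing the result.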
Signed-off-by: Arnie Chang <arnie.ch...@sifive.com> --- libavcodec/h264dsp.c | 2 + libavcodec/h264dsp.h | 3 +- libavcodec/riscv/Makefile | 4 + libavcodec/riscv/h264_dsp_init_riscv.c | 68 +++ libavcodec/riscv/h264_idct.c | 482 ++++++++++++++++++ libavcodec/riscv/h264_idct.h | 46 ++ libavcodec/riscv/h264_inloop.c | 669 +++++++++++++++++++++++++ libavcodec/riscv/h264_inloop.h | 47 ++ libavcodec/riscv/h264_weighted_sum.c | 273 ++++++++++ libavcodec/riscv/h264_weighted_sum.h | 47 ++ 10 files changed, 1640 insertions(+), 1 deletion(-) create mode 100644 libavcodec/riscv/h264_dsp_init_riscv.c create mode 100644 libavcodec/riscv/h264_idct.c create mode 100644 libavcodec/riscv/h264_idct.h create mode 100644 libavcodec/riscv/h264_inloop.c create mode 100644 libavcodec/riscv/h264_inloop.h create mode 100644 libavcodec/riscv/h264_weighted_sum.c create mode 100644 libavcodec/riscv/h264_weighted_sum.h diff --git a/libavcodec/h264dsp.c b/libavcodec/h264dsp.c index 4d2ee10bab..b6e45c15ef 100644 --- a/libavcodec/h264dsp.c +++ b/libavcodec/h264dsp.c @@ -164,5 +164,7 @@ av_cold void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, ff_h264dsp_init_mips(c, bit_depth, chroma_format_idc); #elif ARCH_LOONGARCH ff_h264dsp_init_loongarch(c, bit_depth, chroma_format_idc); +#elif ARCH_RISCV + ff_h264dsp_init_riscv(c, bit_depth, chroma_format_idc); #endif } diff --git a/libavcodec/h264dsp.h b/libavcodec/h264dsp.h index e0880c4d88..f2f8aa7e60 100644 --- a/libavcodec/h264dsp.h +++ b/libavcodec/h264dsp.h @@ -131,5 +131,6 @@ void ff_h264dsp_init_mips(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); void ff_h264dsp_init_loongarch(H264DSPContext *c, const int bit_depth, const int chroma_format_idc); - +void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, + const int chroma_format_idc); #endif /* AVCODEC_H264DSP_H */ diff --git a/libavcodec/riscv/Makefile b/libavcodec/riscv/Makefile index 088efa3b1e..4d54bf35e9 100644 --- a/libavcodec/riscv/Makefile +++ b/libavcodec/riscv/Makefile @@ -24,3 +24,7 @@ OBJS-$(CONFIG_H264CHROMA) += riscv/h264_chroma_init_riscv.o RVV-OBJS-$(CONFIG_H264CHROMA) += riscv/h264_mc_chroma.o OBJS-$(CONFIG_H264QPEL) += riscv/h264_qpel_init_riscv.o RVV-OBJS-$(CONFIG_H264QPEL) += riscv/h264_mc_luma.o +OBJS-$(CONFIG_H264DSP) += riscv/h264_dsp_init_riscv.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_weighted_sum.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_inloop.o +RVV-OBJS-$(CONFIG_H264DSP) += riscv/h264_idct.o diff --git a/libavcodec/riscv/h264_dsp_init_riscv.c b/libavcodec/riscv/h264_dsp_init_riscv.c new file mode 100644 index 0000000000..7d41aa98a5 --- /dev/null +++ b/libavcodec/riscv/h264_dsp_init_riscv.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdint.h> +#include "libavutil/attributes.h" +#include "libavutil/cpu.h" +#include "libavcodec/h264dsp.h" +#include "config.h" +#include "h264_inloop.h" +#include "h264_weighted_sum.h" +#include "h264_idct.h" + +av_cold void ff_h264dsp_init_riscv(H264DSPContext *c, const int bit_depth, const int chroma_format_idc) +{ +#if HAVE_INTRINSICS_RVV + if (bit_depth == 8) { + c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_8_rvv; + c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_8_rvv; + + c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_8_rvv; + c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_8_rvv; + + c->h264_v_loop_filter_chroma = h264_v_loop_filter_chroma_8_rvv; + c->h264_v_loop_filter_chroma_intra = h264_v_loop_filter_chroma_intra_8_rvv; + + if (chroma_format_idc <= 1) { + c->h264_h_loop_filter_chroma = h264_h_loop_filter_chroma_8_rvv; + c->h264_h_loop_filter_chroma_intra = h264_h_loop_filter_chroma_intra_8_rvv; + c->h264_h_loop_filter_chroma_mbaff_intra = h264_h_loop_filter_chroma_mbaff_intra_8_rvv; + } + + c->weight_h264_pixels_tab[0] = weight_h264_pixels_16_8_rvv; + c->weight_h264_pixels_tab[1] = weight_h264_pixels_8_8_rvv; + c->weight_h264_pixels_tab[2] = weight_h264_pixels_4_8_rvv; + + c->biweight_h264_pixels_tab[0]= biweight_h264_pixels_16_8_rvv; + c->biweight_h264_pixels_tab[1]= biweight_h264_pixels_8_8_rvv; + c->biweight_h264_pixels_tab[2]= biweight_h264_pixels_4_8_rvv; + + c->h264_idct_add = h264_idct_add_8_rvv; + c->h264_idct_dc_add = h264_idct_dc_add_8_rvv; + c->h264_idct_add16 = h264_idct_add16_8_rvv; + c->h264_idct_add16intra = h264_idct_add16_intra_8_rvv; + if (chroma_format_idc <= 1) + c->h264_idct_add8 = h264_idct_add8_8_rvv; + c->h264_idct8_add = h264_idct8_add_8_rvv; + c->h264_idct8_dc_add = h264_idct8_dc_add_8_rvv; + c->h264_idct8_add4 = h264_idct8_add4_8_rvv; + } +#endif +} diff --git a/libavcodec/riscv/h264_idct.c b/libavcodec/riscv/h264_idct.c new file mode 100644 index 0000000000..3ef6b74421 --- /dev/null +++ b/libavcodec/riscv/h264_idct.c @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264_idct.h" +#if HAVE_INTRINSICS_RVV +#include <riscv_vector.h> + +static const uint8_t scan8[16 * 3 + 3] = +{ + 4 + 1 * 8, 5 + 1 * 8, 4 + 2 * 8, 5 + 2 * 8, + 6 + 1 * 8, 7 + 1 * 8, 6 + 2 * 8, 7 + 2 * 8, + 4 + 3 * 8, 5 + 3 * 8, 4 + 4 * 8, 5 + 4 * 8, + 6 + 3 * 8, 7 + 3 * 8, 6 + 4 * 8, 7 + 4 * 8, + 4 + 6 * 8, 5 + 6 * 8, 4 + 7 * 8, 5 + 7 * 8, + 6 + 6 * 8, 7 + 6 * 8, 6 + 7 * 8, 7 + 7 * 8, + 4 + 8 * 8, 5 + 8 * 8, 4 + 9 * 8, 5 + 9 * 8, + 6 + 8 * 8, 7 + 8 * 8, 6 + 9 * 8, 7 + 9 * 8, + 4 + 11 * 8, 5 + 11 * 8, 4 + 12 * 8, 5 + 12 * 8, + 6 + 11 * 8, 7 + 11 * 8, 6 + 12 * 8, 7 + 12 * 8, + 4 + 13 * 8, 5 + 13 * 8, 4 + 14 * 8, 5 + 14 * 8, + 6 + 13 * 8, 7 + 13 * 8, 6 + 14 * 8, 7 + 14 * 8, + 0 + 0 * 8, 0 + 5 * 8, 0 + 10 * 8 +}; + +void h264_idct_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int16_t temp[16]; + int vl = __riscv_vsetvl_e16m1(4); + + p_block[0] += 32; + + vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 4, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 8, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 12, vl); + + // 1-D row idct + vint16m1_t z0 = __riscv_vadd_vv_i16m1(row0, row2, vl); + vint16m1_t z1 = __riscv_vsub_vv_i16m1(row0, row2, vl); + vint16m1_t z2 = __riscv_vsra_vx_i16m1(row1, 1, vl); + z2 = __riscv_vsub_vv_i16m1(z2, row3, vl); + vint16m1_t z3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + z3 = __riscv_vadd_vv_i16m1(z3, row1, vl); + + vint16m1_t result0 = __riscv_vadd_vv_i16m1(z0, z3, vl); + vint16m1_t result1 = __riscv_vadd_vv_i16m1(z1, z2, vl); + vint16m1_t result2 = __riscv_vsub_vv_i16m1(z1, z2, vl); + vint16m1_t result3 = __riscv_vsub_vv_i16m1(z0, z3, vl); + + // transpose + __riscv_vse16_v_i16m1(&temp[0], result0, vl); + __riscv_vse16_v_i16m1(&temp[4], result1, vl); + __riscv_vse16_v_i16m1(&temp[8], result2, vl); + __riscv_vse16_v_i16m1(&temp[12], result3, vl); + __riscv_vlseg4e16_v_i16m1(&row0, &row1, &row2, &row3, &temp[0], vl); + + // 1-D column idct + z0 = __riscv_vadd_vv_i16m1(row0, row2, vl); + z1 = __riscv_vsub_vv_i16m1(row0, row2, vl); + z2 = __riscv_vsra_vx_i16m1(row1, 1, vl); + z2 = __riscv_vsub_vv_i16m1(z2, row3, vl); + z3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + z3 = __riscv_vadd_vv_i16m1(z3, row1, vl); + + result0 = __riscv_vadd_vv_i16m1(z0, z3, vl); + result1 = __riscv_vadd_vv_i16m1(z1, z2, vl); + result2 = __riscv_vsub_vv_i16m1(z1, z2, vl); + result3 = __riscv_vsub_vv_i16m1(z0, z3, vl); + + result0 = __riscv_vsra_vx_i16m1(result0, 6, vl); + result1 = __riscv_vsra_vx_i16m1(result1, 6, vl); + result2 = __riscv_vsra_vx_i16m1(result2, 6, vl); + result3 = __riscv_vsra_vx_i16m1(result3, 6, vl); + + vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl); + vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl); + vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl); + vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl); + + vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl)); + vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl)); + vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl)); + vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl)); + + result0 = 
__riscv_vadd_vv_i16m1(result0, dst0_w, vl); + result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl); + result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl); + result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, vl); + + result0 = __riscv_vmax_vx_i16m1(result0, 0, vl); + result1 = __riscv_vmax_vx_i16m1(result1, 0, vl); + result2 = __riscv_vmax_vx_i16m1(result2, 0, vl); + result3 = __riscv_vmax_vx_i16m1(result3, 0, vl); + + vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl); + vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl); + vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl); + vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl); + + __riscv_vse8_v_u8mf2(p_dst, result0_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl); + + memset(p_block, 0, sizeof(int16_t) * 16); +} + +void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int vl = __riscv_vsetvl_e16m1(4); + + int dc = (p_block[0] + 32) >> 6; + + if (dc > 255) + dc = 255; + + if (dc < -255) + dc = -255; + + p_block[0] = 0; + + vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst, vl); + vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst + stride, vl); + vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst + stride * 2, vl); + vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst + stride * 3, vl); + + if (dc >= 0) + { + dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl); + dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl); + dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl); + dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl); + } + else + { + dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl); + dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl); + dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl); + dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl); + } + + __riscv_vse8_v_u8m1(p_dst, dst0, vl); + __riscv_vse8_v_u8m1(p_dst + stride, dst1, vl); + __riscv_vse8_v_u8m1(p_dst + stride * 2, dst2, vl); + __riscv_vse8_v_u8m1(p_dst + stride * 3, dst3, vl); +} + +void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i++) + { + int nnz = nnzc[scan8[i]]; + + if(nnz) + { + if(nnz==1 && p_block[i*16]) + h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else + h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} + +void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i++) + { + if(nnzc[scan8[i]]) + h264_idct_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else if(p_block[i*16]) + h264_idct_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } +} + +void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[15*8]) +{ + for(int j = 1; j < 3; j++) + { + for(int i = j * 16; i < j * 16 + 4; i++) + { + if(nnzc[scan8[i]]) + h264_idct_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else if(p_block[i * 16]) + h264_idct_dc_add_8_rvv(p_dst[j - 1] + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} + +void 
h264_idct8_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int16_t temp[64]; + int vl = __riscv_vsetvl_e16m1(8); + + p_block[0] += 32; + + vint16m1_t row0 = __riscv_vle16_v_i16m1(p_block, vl); + vint16m1_t row1 = __riscv_vle16_v_i16m1(p_block + 8, vl); + vint16m1_t row2 = __riscv_vle16_v_i16m1(p_block + 16, vl); + vint16m1_t row3 = __riscv_vle16_v_i16m1(p_block + 24, vl); + vint16m1_t row4 = __riscv_vle16_v_i16m1(p_block + 32, vl); + vint16m1_t row5 = __riscv_vle16_v_i16m1(p_block + 40, vl); + vint16m1_t row6 = __riscv_vle16_v_i16m1(p_block + 48, vl); + vint16m1_t row7 = __riscv_vle16_v_i16m1(p_block + 56, vl); + + // 1-D row idct + vint16m1_t a0 = __riscv_vadd_vv_i16m1(row0, row4, vl); + vint16m1_t a2 = __riscv_vsub_vv_i16m1(row0, row4, vl); + vint16m1_t a4 = __riscv_vsra_vx_i16m1(row2, 1, vl); + a4 = __riscv_vsub_vv_i16m1(a4, row6, vl); + vint16m1_t a6 = __riscv_vsra_vx_i16m1(row6, 1, vl); + a6 = __riscv_vadd_vv_i16m1(row2, a6, vl); + + vint16m1_t b0 = __riscv_vadd_vv_i16m1(a0, a6, vl); + vint16m1_t b2 = __riscv_vadd_vv_i16m1(a2, a4, vl); + vint16m1_t b4 = __riscv_vsub_vv_i16m1(a2, a4, vl); + vint16m1_t b6 = __riscv_vsub_vv_i16m1(a0, a6, vl); + + vint16m1_t a1 = __riscv_vsra_vx_i16m1(row7, 1, vl); + a1 = __riscv_vsub_vv_i16m1(row5, a1, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row3, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row7, vl); + vint16m1_t a3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + a3 = __riscv_vsub_vv_i16m1(row7, a3, vl); + a3 = __riscv_vadd_vv_i16m1(a3, row1, vl); + a3 = __riscv_vsub_vv_i16m1(a3, row3, vl); + vint16m1_t a5 = __riscv_vsra_vx_i16m1(row5, 1, vl); + a5 = __riscv_vsub_vv_i16m1(a5, row1, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row7, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row5, vl); + vint16m1_t a7 = __riscv_vsra_vx_i16m1(row1, 1, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row3, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row5, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row1, vl); + + vint16m1_t b1 = __riscv_vsra_vx_i16m1(a7, 2, vl); + b1 = __riscv_vadd_vv_i16m1(b1, a1, vl); + vint16m1_t b3 = __riscv_vsra_vx_i16m1(a5, 2, vl); + b3 = __riscv_vadd_vv_i16m1(b3, a3, vl); + vint16m1_t b5 = __riscv_vsra_vx_i16m1(a3, 2, vl); + b5 = __riscv_vsub_vv_i16m1(b5, a5, vl); + vint16m1_t b7 = __riscv_vsra_vx_i16m1(a1, 2, vl); + b7 = __riscv_vsub_vv_i16m1(a7, b7, vl); + + vint16m1_t result0 = __riscv_vadd_vv_i16m1(b0, b7, vl); + vint16m1_t result7 = __riscv_vsub_vv_i16m1(b0, b7, vl); + vint16m1_t result1 = __riscv_vadd_vv_i16m1(b2, b5, vl); + vint16m1_t result6 = __riscv_vsub_vv_i16m1(b2, b5, vl); + vint16m1_t result2 = __riscv_vadd_vv_i16m1(b4, b3, vl); + vint16m1_t result5 = __riscv_vsub_vv_i16m1(b4, b3, vl); + vint16m1_t result3 = __riscv_vadd_vv_i16m1(b6, b1, vl); + vint16m1_t result4 = __riscv_vsub_vv_i16m1(b6, b1, vl); + + // transpose + __riscv_vse16_v_i16m1(&temp[0], result0, vl); + __riscv_vse16_v_i16m1(&temp[8], result1, vl); + __riscv_vse16_v_i16m1(&temp[16], result2, vl); + __riscv_vse16_v_i16m1(&temp[24], result3, vl); + __riscv_vse16_v_i16m1(&temp[32], result4, vl); + __riscv_vse16_v_i16m1(&temp[40], result5, vl); + __riscv_vse16_v_i16m1(&temp[48], result6, vl); + __riscv_vse16_v_i16m1(&temp[56], result7, vl); + + __riscv_vlseg8e16_v_i16m1(&row0, &row1, &row2, &row3, &row4, &row5, &row6, &row7, &temp[0], vl); + + // 1-D column idct + a0 = __riscv_vadd_vv_i16m1(row0, row4, vl); + a2 = __riscv_vsub_vv_i16m1(row0, row4, vl); + a4 = __riscv_vsra_vx_i16m1(row2, 1, vl); + a4 = __riscv_vsub_vv_i16m1(a4, row6, vl); + a6 = __riscv_vsra_vx_i16m1(row6, 1, vl); + a6 = __riscv_vadd_vv_i16m1(row2, 
a6, vl); + + b0 = __riscv_vadd_vv_i16m1(a0, a6, vl); + b2 = __riscv_vadd_vv_i16m1(a2, a4, vl); + b4 = __riscv_vsub_vv_i16m1(a2, a4, vl); + b6 = __riscv_vsub_vv_i16m1(a0, a6, vl); + + a1 = __riscv_vsra_vx_i16m1(row7, 1, vl); + a1 = __riscv_vsub_vv_i16m1(row5, a1, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row3, vl); + a1 = __riscv_vsub_vv_i16m1(a1, row7, vl); + a3 = __riscv_vsra_vx_i16m1(row3, 1, vl); + a3 = __riscv_vsub_vv_i16m1(row7, a3, vl); + a3 = __riscv_vadd_vv_i16m1(a3, row1, vl); + a3 = __riscv_vsub_vv_i16m1(a3, row3, vl); + a5 = __riscv_vsra_vx_i16m1(row5, 1, vl); + a5 = __riscv_vsub_vv_i16m1(a5, row1, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row7, vl); + a5 = __riscv_vadd_vv_i16m1(a5, row5, vl); + a7 = __riscv_vsra_vx_i16m1(row1, 1, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row3, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row5, vl); + a7 = __riscv_vadd_vv_i16m1(a7, row1, vl); + + b1 = __riscv_vsra_vx_i16m1(a7, 2, vl); + b1 = __riscv_vadd_vv_i16m1(b1, a1, vl); + b3 = __riscv_vsra_vx_i16m1(a5, 2, vl); + b3 = __riscv_vadd_vv_i16m1(b3, a3, vl); + b5 = __riscv_vsra_vx_i16m1(a3, 2, vl); + b5 = __riscv_vsub_vv_i16m1(b5, a5, vl); + b7 = __riscv_vsra_vx_i16m1(a1, 2, vl); + b7 = __riscv_vsub_vv_i16m1(a7, b7, vl); + + result0 = __riscv_vadd_vv_i16m1(b0, b7, vl); + result1 = __riscv_vadd_vv_i16m1(b2, b5, vl); + result2 = __riscv_vadd_vv_i16m1(b4, b3, vl); + result3 = __riscv_vadd_vv_i16m1(b6, b1, vl); + result4 = __riscv_vsub_vv_i16m1(b6, b1, vl); + result5 = __riscv_vsub_vv_i16m1(b4, b3, vl); + result6 = __riscv_vsub_vv_i16m1(b2, b5, vl); + result7 = __riscv_vsub_vv_i16m1(b0, b7, vl); + + // normalize and write to destination + result0 = __riscv_vsra_vx_i16m1(result0, 6, vl); + result1 = __riscv_vsra_vx_i16m1(result1, 6, vl); + result2 = __riscv_vsra_vx_i16m1(result2, 6, vl); + result3 = __riscv_vsra_vx_i16m1(result3, 6, vl); + result4 = __riscv_vsra_vx_i16m1(result4, 6, vl); + result5 = __riscv_vsra_vx_i16m1(result5, 6, vl); + result6 = __riscv_vsra_vx_i16m1(result6, 6, vl); + result7 = __riscv_vsra_vx_i16m1(result7, 6, vl); + + vuint8mf2_t dst0 = __riscv_vle8_v_u8mf2(p_dst, vl); + vuint8mf2_t dst1 = __riscv_vle8_v_u8mf2(p_dst + stride, vl); + vuint8mf2_t dst2 = __riscv_vle8_v_u8mf2(p_dst + stride * 2, vl); + vuint8mf2_t dst3 = __riscv_vle8_v_u8mf2(p_dst + stride * 3, vl); + vuint8mf2_t dst4 = __riscv_vle8_v_u8mf2(p_dst + stride * 4, vl); + vuint8mf2_t dst5 = __riscv_vle8_v_u8mf2(p_dst + stride * 5, vl); + vuint8mf2_t dst6 = __riscv_vle8_v_u8mf2(p_dst + stride * 6, vl); + vuint8mf2_t dst7 = __riscv_vle8_v_u8mf2(p_dst + stride * 7, vl); + + vint16m1_t dst0_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst0, vl)); + vint16m1_t dst1_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst1, vl)); + vint16m1_t dst2_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst2, vl)); + vint16m1_t dst3_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst3, vl)); + vint16m1_t dst4_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst4, vl)); + vint16m1_t dst5_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst5, vl)); + vint16m1_t dst6_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst6, vl)); + vint16m1_t dst7_w = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwcvtu_x_x_v_u16m1(dst7, vl)); + + result0 = __riscv_vadd_vv_i16m1(result0, dst0_w, vl); + result1 = __riscv_vadd_vv_i16m1(result1, dst1_w, vl); + result2 = __riscv_vadd_vv_i16m1(result2, dst2_w, vl); + result3 = __riscv_vadd_vv_i16m1(result3, dst3_w, 
vl); + result4 = __riscv_vadd_vv_i16m1(result4, dst4_w, vl); + result5 = __riscv_vadd_vv_i16m1(result5, dst5_w, vl); + result6 = __riscv_vadd_vv_i16m1(result6, dst6_w, vl); + result7 = __riscv_vadd_vv_i16m1(result7, dst7_w, vl); + + result0 = __riscv_vmax_vx_i16m1(result0, 0, vl); + result1 = __riscv_vmax_vx_i16m1(result1, 0, vl); + result2 = __riscv_vmax_vx_i16m1(result2, 0, vl); + result3 = __riscv_vmax_vx_i16m1(result3, 0, vl); + result4 = __riscv_vmax_vx_i16m1(result4, 0, vl); + result5 = __riscv_vmax_vx_i16m1(result5, 0, vl); + result6 = __riscv_vmax_vx_i16m1(result6, 0, vl); + result7 = __riscv_vmax_vx_i16m1(result7, 0, vl); + + vuint8mf2_t result0_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result0), 0, vl); + vuint8mf2_t result1_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result1), 0, vl); + vuint8mf2_t result2_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result2), 0, vl); + vuint8mf2_t result3_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result3), 0, vl); + vuint8mf2_t result4_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result4), 0, vl); + vuint8mf2_t result5_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result5), 0, vl); + vuint8mf2_t result6_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result6), 0, vl); + vuint8mf2_t result7_n = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(result7), 0, vl); + + __riscv_vse8_v_u8mf2(p_dst, result0_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride, result1_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 2, result2_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 3, result3_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 4, result4_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 5, result5_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 6, result6_n, vl); + __riscv_vse8_v_u8mf2(p_dst + stride * 7, result7_n, vl); + + memset(p_block, 0, sizeof(int16_t) * 64); +} + +void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride) +{ + int count = 8; + uint8_t *p_dst_iter = p_dst; + + int dc = (p_block[0] + 32) >> 6; + + if (dc > 255) + dc = 255; + + if (dc < -255) + dc = -255; + + p_block[0] = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e16m1(8); + + vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl); + vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl); + vuint8m1_t dst2 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 2, vl); + vuint8m1_t dst3 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 3, vl); + vuint8m1_t dst4 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 4, vl); + vuint8m1_t dst5 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 5, vl); + vuint8m1_t dst6 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 6, vl); + vuint8m1_t dst7 = __riscv_vle8_v_u8m1(p_dst_iter + stride * 7, vl); + + if (dc >= 0) + { + dst0 = __riscv_vsaddu_vx_u8m1(dst0, dc, vl); + dst1 = __riscv_vsaddu_vx_u8m1(dst1, dc, vl); + dst2 = __riscv_vsaddu_vx_u8m1(dst2, dc, vl); + dst3 = __riscv_vsaddu_vx_u8m1(dst3, dc, vl); + dst4 = __riscv_vsaddu_vx_u8m1(dst4, dc, vl); + dst5 = __riscv_vsaddu_vx_u8m1(dst5, dc, vl); + dst6 = __riscv_vsaddu_vx_u8m1(dst6, dc, vl); + dst7 = __riscv_vsaddu_vx_u8m1(dst7, dc, vl); + } + else + { + dst0 = __riscv_vssubu_vx_u8m1(dst0, -dc, vl); + dst1 = __riscv_vssubu_vx_u8m1(dst1, -dc, vl); + dst2 = __riscv_vssubu_vx_u8m1(dst2, -dc, vl); + dst3 = __riscv_vssubu_vx_u8m1(dst3, -dc, vl); + dst4 = __riscv_vssubu_vx_u8m1(dst4, -dc, vl); + dst5 = __riscv_vssubu_vx_u8m1(dst5, -dc, vl); + dst6 = 
__riscv_vssubu_vx_u8m1(dst6, -dc, vl); + dst7 = __riscv_vssubu_vx_u8m1(dst7, -dc, vl); + } + + __riscv_vse8_v_u8m1(p_dst_iter, dst0, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride, dst1, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 2, dst2, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 3, dst3, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 4, dst4, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 5, dst5, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 6, dst6, vl); + __riscv_vse8_v_u8m1(p_dst_iter + stride * 7, dst7, vl); + + count -= vl; + p_dst_iter += vl; + } +} + +void h264_idct8_add4_8_rvv(uint8_t *p_dst, const int *p_block_offset, + int16_t *p_block, int stride, const uint8_t nnzc[5 * 8]) +{ + for(int i = 0; i < 16; i += 4) + { + int nnz = nnzc[scan8[i]]; + + if(nnz) + { + if(nnz == 1 && p_block[i * 16]) + h264_idct8_dc_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + else + h264_idct8_add_8_rvv(p_dst + p_block_offset[i], p_block + i * 16 * sizeof(pixel), stride); + } + } +} +#endif + diff --git a/libavcodec/riscv/h264_idct.h b/libavcodec/riscv/h264_idct.h new file mode 100644 index 0000000000..4b942c35f7 --- /dev/null +++ b/libavcodec/riscv/h264_idct.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RISCV_H264_IDCT_H +#define AVCODEC_RISCV_H264_IDCT_H +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stddef.h> +#include "config.h" + +#if HAVE_INTRINSICS_RVV +typedef unsigned char pixel; + +void h264_idct_add_8_rvv(uint8_t *dst, int16_t *block, int stride); +void h264_idct_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride); +void h264_idct_add16_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]); +void h264_idct_add16_intra_8_rvv(uint8_t *p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[5 * 8]); +void h264_idct_add8_8_rvv(uint8_t **p_dst, const int *p_block_offset, int16_t *p_block, int stride, + const uint8_t nnzc[15*8]); +void h264_idct8_add_8_rvv(uint8_t *_dst, int16_t *_block, int stride); +void h264_idct8_dc_add_8_rvv(uint8_t *p_dst, int16_t *p_block, int stride); +void h264_idct8_add4_8_rvv(uint8_t *dst, const int *block_offset, + int16_t *block, int stride, const uint8_t nnzc[5 * 8]); +#endif +#endif \ No newline at end of file diff --git a/libavcodec/riscv/h264_inloop.c b/libavcodec/riscv/h264_inloop.c new file mode 100644 index 0000000000..d14cf4dd7a --- /dev/null +++ b/libavcodec/riscv/h264_inloop.c @@ -0,0 +1,669 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. 
+ * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "h264_inloop.h" +#if HAVE_INTRINSICS_RVV +#include <riscv_vector.h> + +__attribute__((always_inline)) static void extend_tc0_2(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl) +{ + if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3]) + { + *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl); + } + else + { + const uint8_t tc02_index[] = {0, 0, 1, 1, 2, 2, 3, 3}; + vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4); + vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc02_index + start, vl); + *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl); + } +} + +__attribute__((always_inline)) static void extend_tc0(vint8mf2_t *p_tc0_i8, int8_t *p_tc0, size_t start, int vl) +{ + if (p_tc0[0] == p_tc0[1] && p_tc0[1] == p_tc0[2] && p_tc0[2] == p_tc0[3]) + { + *p_tc0_i8 = __riscv_vmv_v_x_i8mf2(p_tc0[0], vl); + } + else + { + const uint8_t tc01_index[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + vint8mf2_t tc8 = __riscv_vle8_v_i8mf2(p_tc0, 4); + vuint8mf2_t v_index = __riscv_vle8_v_u8mf2(tc01_index + start, vl); + *p_tc0_i8 = __riscv_vrgather_vv_i8mf2(tc8, v_index, vl); + } +} + +__attribute__((always_inline)) static void luma_core(vuint8mf2_t *p_p1_dst, vuint8mf2_t *p_p0_dst, + vuint8mf2_t *p_q0_dst, vuint8mf2_t *p_q1_dst, + vuint8mf2_t p2, vuint8mf2_t p1, vuint8mf2_t p0, + vuint8mf2_t q0, vuint8mf2_t q1, vuint8mf2_t q2, + vint8mf2_t tc8, int alpha, int beta, int vl) +{ + vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl)); + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + vint16m1_t q2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl)); + + vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + vint16m1_t sub_p2_p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl); + vint16m1_t sub_q2_q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl); + + vint16m1_t minus_sub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl); + vint16m1_t minus_sub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl); + vint16m1_t minus_sub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl); + vint16m1_t minus_sub_p2_p0 = __riscv_vrsub_vx_i16m1(sub_p2_p0, 0, vl); + vint16m1_t minus_sub_q2_q0 = __riscv_vrsub_vx_i16m1(sub_q2_q0, 0, vl); + + vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, minus_sub_q0_p0, vl); + vint16m1_t abs_diff12 = 
__riscv_vmax_vv_i16m1(sub_p1_p0, minus_sub_p1_p0, vl); + vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, minus_sub_q1_q0, vl); + vint16m1_t abs_diff2 = __riscv_vmax_vv_i16m1(sub_p2_p0, minus_sub_p2_p0, vl); + vint16m1_t abs_diff3 = __riscv_vmax_vv_i16m1(sub_q2_q0, minus_sub_q2_q0, vl); + + vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl); + vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl); + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abs_diff11, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abs_diff12, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abs_diff13, beta, vl); + vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abs_diff2, beta, vl); + vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abs_diff3, beta, vl); + + vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond_mask, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond12, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl); + cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl); + cond3 = __riscv_vmand_mm_b16(cond3, cond1, vl); + + // p1 + vint16m1_t sum_p0_q0 = __riscv_vaadd_vv_i16m1(p0_i16, q0_i16, vl); + vint16m1_t p1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, p2_i16, vl); + p1_new_i16 = __riscv_vsra_vx_i16m1(p1_new_i16, 1, vl); + vint16m1_t p1_new_upper = __riscv_vadd_vv_i16m1(p1_i16, tc, vl); + vint16m1_t p1_new_lower = __riscv_vsub_vv_i16m1(p1_i16, tc, vl); + p1_new_i16 = __riscv_vmax_vv_i16m1(p1_new_i16, p1_new_lower, vl); + p1_new_i16 = __riscv_vmin_vv_i16m1(p1_new_i16, p1_new_upper, vl); + *p_p1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond2, p1, __riscv_vreinterpret_v_i16m1_u16m1(p1_new_i16), vl); + vint16m1_t tc_adjust = __riscv_vadc_vxm_i16m1(tc, 0, cond2, vl); + + // q1 + vint16m1_t q1_new_i16 = __riscv_vadd_vv_i16m1(sum_p0_q0, q2_i16, vl); + q1_new_i16 = __riscv_vsra_vx_i16m1(q1_new_i16, 1, vl); + vint16m1_t q1_new_upper = __riscv_vadd_vv_i16m1(q1_i16, tc, vl); + vint16m1_t q1_new_lower = __riscv_vsub_vv_i16m1(q1_i16, tc, vl); + q1_new_i16 = __riscv_vmax_vv_i16m1(q1_new_i16, q1_new_lower, vl); + q1_new_i16 = __riscv_vmin_vv_i16m1(q1_new_i16, q1_new_upper, vl); + *p_q1_dst = __riscv_vncvt_x_x_w_u8mf2_mu(cond3, q1, __riscv_vreinterpret_v_i16m1_u16m1(q1_new_i16), vl); + tc_adjust = __riscv_vadc_vxm_i16m1(tc_adjust, 0, cond3, vl); + + // p0, q0 + vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl); + vint16m1_t delta_i16 = __riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl); + delta_i16 = __riscv_vadd_vv_i16m1(delta_i16, sub_p1_q1, vl); + delta_i16 = __riscv_vssra_vx_i16m1(delta_i16, 3, vl); + delta_i16 = __riscv_vmin_vv_i16m1(delta_i16, tc_adjust, vl); + delta_i16 = __riscv_vmax_vv_i16m1(delta_i16, __riscv_vrsub_vx_i16m1(tc_adjust, 0, vl), vl); + + vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta_i16, vl); + vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta_i16, vl); + p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl); + q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl); + + *p_p0_dst= __riscv_vnclipu_wx_u8mf2_mu(cond1, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond1, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + 
vint8mf2_t tc8; + extend_tc0(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl); + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl); + + vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst; + luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_luma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p2, p1, p0, q0, q1, q2; + __riscv_vlsseg6e8_v_u8mf2(&p2, &p1, &p0, &q0, &q1, &q2, p_iter - 3, stride, width); + + vuint8mf2_t p1_dst, p0_dst, q0_dst, q1_dst; + luma_core(&p1_dst, &p0_dst, &q0_dst, &q1_dst, p2, p1, p0, q0, q1, q2, tc8, alpha, beta, vl); + + __riscv_vssseg4e8_v_u8mf2(p_iter - 2, stride, p1_dst, p0_dst, q0_dst, q1_dst, 16); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void chroma_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t p1, vuint8mf2_t p0, vuint8mf2_t q0, + vuint8mf2_t q1, vint8mf2_t tc8, int alpha, + int beta, int vl) +{ + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + + vint16m1_t sub_q0_p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1_p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1_q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + + vint16m1_t rsub_q0_p0 = __riscv_vrsub_vx_i16m1(sub_q0_p0, 0, vl); + vint16m1_t rsub_p1_p0 = __riscv_vrsub_vx_i16m1(sub_p1_p0, 0, vl); + vint16m1_t rsub_q1_q0 = __riscv_vrsub_vx_i16m1(sub_q1_q0, 0, vl); + + vint16m1_t abs_diff11 = __riscv_vmax_vv_i16m1(sub_q0_p0, rsub_q0_p0, vl); + vint16m1_t abs_diff12 = __riscv_vmax_vv_i16m1(sub_p1_p0, rsub_p1_p0, vl); + vint16m1_t abs_diff13 = __riscv_vmax_vv_i16m1(sub_q1_q0, rsub_q1_q0, vl); + + vint16m1_t tc = __riscv_vwadd_vx_i16m1(tc8, 0, vl); + vbool16_t cond_mask = __riscv_vmsge_vx_i16m1_b16(tc, 0, vl); + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16_mu(cond_mask, cond_mask, abs_diff11, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abs_diff12, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abs_diff13, beta, vl); + + vint16m1_t sub_p1_q1 = __riscv_vsub_vv_i16m1(p1_i16, q1_i16, vl); + vint16m1_t delta = 
__riscv_vsll_vx_i16m1(sub_q0_p0, 2, vl); + delta = __riscv_vadd_vv_i16m1(delta, sub_p1_q1, vl); + delta = __riscv_vssra_vx_i16m1(delta, 3, vl); + delta = __riscv_vmin_vv_i16m1(delta, tc, vl); + delta = __riscv_vmax_vv_i16m1(delta, __riscv_vrsub_vx_i16m1(tc, 0, vl), vl); + + vint16m1_t p0_new_i16 = __riscv_vadd_vv_i16m1(p0_i16, delta, vl); + vint16m1_t q0_new_i16 = __riscv_vsub_vv_i16m1(q0_i16, delta, vl); + p0_new_i16 = __riscv_vmax_vx_i16m1(p0_new_i16, 0, vl); + q0_new_i16 = __riscv_vmax_vx_i16m1(q0_new_i16, 0, vl); + + *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new_i16), 0, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new_i16), 0, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0_2(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + + count -= vl; + tc_offset += vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta, int8_t *p_tc0) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + int tc_offset = 0; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vint8mf2_t tc8; + extend_tc0_2(&tc8, p_tc0, tc_offset, vl); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_core(&p0_dst, &q0_dst, p1, p0, q0, q1, tc8, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + tc_offset = tc_offset + vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void luma_intra_core(vuint8mf2_t *p_p2_dst, vuint8mf2_t *p_p1_dst, + vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t *p_q1_dst, vuint8mf2_t *p_q2_dst, + vuint8mf2_t p3, vuint8mf2_t p2, vuint8mf2_t p1, + vuint8mf2_t p0, vuint8mf2_t q0, vuint8mf2_t q1, + vuint8mf2_t q2, vuint8mf2_t q3, int alpha, + int beta, int vl) +{ + vint16m1_t p3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p3, 0, vl)); + vint16m1_t p2_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p2, 0, vl)); + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + vint16m1_t q2_i16 = 
__riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q2, 0, vl)); + vint16m1_t q3_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q3, 0, vl)); + + // p0 + vint16m1_t sum_p1p0q0 = __riscv_vadd_vv_i16m1(p0_i16, p1_i16, vl); + sum_p1p0q0 = __riscv_vadd_vv_i16m1(sum_p1p0q0, q0_i16, vl); + + vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl); + vint16m1_t p0_new2_i16 = __riscv_vadd_vv_i16m1(p2_i16, q1_i16, vl); + + // p1 + vint16m1_t p1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p1p0q0, p2_i16, vl); + + // q0 + vint16m1_t sum_p0q0q1 = __riscv_vadd_vv_i16m1(p0_i16, q0_i16, vl); + sum_p0q0q1 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q1_i16, vl); + + vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl); + vint16m1_t q0_new2_i16 = __riscv_vadd_vv_i16m1(q2_i16, p1_i16, vl); + + // q1 + vint16m1_t q1_new1_i16 = __riscv_vadd_vv_i16m1(sum_p0q0q1, q2_i16, vl); + + p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl); + p0_new2_i16 = __riscv_vmacc_vx_i16m1(p0_new2_i16, 2, sum_p1p0q0, vl); + vint16m1_t p2_new1_i16 = __riscv_vmadd_vx_i16m1(p3_i16, 2, sum_p1p0q0, vl); + p2_new1_i16 = __riscv_vmacc_vx_i16m1(p2_new1_i16, 3, p2_i16, vl); + q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl); + q0_new2_i16 = __riscv_vmacc_vx_i16m1(q0_new2_i16, 2, sum_p0q0q1, vl); + vint16m1_t q2_new1_i16 = __riscv_vmadd_vx_i16m1(q3_i16, 2, sum_p0q0q1, vl); + q2_new1_i16 = __riscv_vmacc_vx_i16m1(q2_new1_i16, 3, q2_i16, vl); + + vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + vint16m1_t sub_p2p0 = __riscv_vsub_vv_i16m1(p2_i16, p0_i16, vl); + vint16m1_t sub_q2q0 = __riscv_vsub_vv_i16m1(q2_i16, q0_i16, vl); + + vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl); + vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl); + vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl); + vint16m1_t rsub_p2p0 = __riscv_vrsub_vx_i16m1(sub_p2p0, 0, vl); + vint16m1_t rsub_q2q0 = __riscv_vrsub_vx_i16m1(sub_q2q0, 0, vl); + + vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(rsub_q0p0, sub_q0p0, vl); + vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(rsub_p1p0, sub_p1p0, vl); + vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(rsub_q1q0, sub_q1q0, vl); + vint16m1_t abd_p2p0 = __riscv_vmax_vv_i16m1(rsub_p2p0, sub_p2p0, vl); + vint16m1_t abd_q2q0 = __riscv_vmax_vv_i16m1(rsub_q2q0, sub_q2q0, vl); + + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16(abd_p1p0_, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16(abd_q1q0, beta, vl); + vbool16_t cond2 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, (alpha >> 2) + 2, vl); + vbool16_t cond3 = __riscv_vmslt_vx_i16m1_b16(abd_p2p0, beta, vl); + vbool16_t cond4 = __riscv_vmslt_vx_i16m1_b16(abd_q2q0, beta, vl); + + vbool16_t cond1 = __riscv_vmand_mm_b16(cond11, cond12, vl); + cond1 = __riscv_vmand_mm_b16(cond1, cond13, vl); + cond2 = __riscv_vmand_mm_b16(cond2, cond1, vl); + cond3 = __riscv_vmand_mm_b16(cond3, cond2, vl); + cond4 = __riscv_vmand_mm_b16(cond4, cond2, vl); + + vuint8mf2_t p0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl); + vuint8mf2_t p0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p0_new2_i16), 3, vl); + vuint8mf2_t p1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p1_new1_i16), 2, vl); + vuint8mf2_t p2_new1_u8 = 
__riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(p2_new1_i16), 3, vl); + vuint8mf2_t q0_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl); + vuint8mf2_t q0_new2_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q0_new2_i16), 3, vl); + vuint8mf2_t q1_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q1_new1_i16), 2, vl); + vuint8mf2_t q2_new1_u8 = __riscv_vnclipu_wx_u8mf2(__riscv_vreinterpret_v_i16m1_u16m1(q2_new1_i16), 3, vl); + + *p_p1_dst = __riscv_vmerge_vvm_u8mf2(p1, p1_new1_u8, cond3, vl); + *p_p2_dst = __riscv_vmerge_vvm_u8mf2(p2, p2_new1_u8, cond3, vl); + *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0_new1_u8, p0_new2_u8, cond3, vl); + *p_p0_dst = __riscv_vmerge_vvm_u8mf2(p0, *p_p0_dst, cond1, vl); + + *p_q0_dst = __riscv_vmerge_vvm_u8mf2(q0, q0_new1_u8, cond1, vl); + *p_q0_dst = __riscv_vmerge_vvm_u8mf2(*p_q0_dst, q0_new2_u8, cond4, vl); + *p_q1_dst = __riscv_vmerge_vvm_u8mf2(q1, q1_new1_u8, cond4, vl); + *p_q2_dst = __riscv_vmerge_vvm_u8mf2(q2, q2_new1_u8, cond4, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p3 = __riscv_vle8_v_u8mf2(p_iter - 4 * stride, vl); + vuint8mf2_t p2 = __riscv_vle8_v_u8mf2(p_iter - 3 * stride, vl); + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + vuint8mf2_t q2 = __riscv_vle8_v_u8mf2(p_iter + 2 * stride, vl); + vuint8mf2_t q3 = __riscv_vle8_v_u8mf2(p_iter + 3 * stride, vl); + + vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst; + + luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst, + p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride * 3, p2_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride * 2, p1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride, q1_dst, vl); + __riscv_vse8_v_u8mf2(p_iter + stride * 2, q2_dst, vl); + + count -= vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_luma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p3, p2, p1, p0, q0, q1, q2, q3; + __riscv_vlsseg8e8_v_u8mf2(&p3, &p2, &p1, &p0, + &q0, &q1, &q2, &q3, p_iter - 4, stride, 16); + + vuint8mf2_t p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst; + + luma_intra_core(&p2_dst, &p1_dst, &p0_dst, &q0_dst, &q1_dst, &q2_dst, + p3, p2, p1, p0, q0, q1, q2, q3, alpha, beta, vl); + + __riscv_vssseg6e8_v_u8mf2(p_iter - 3, stride, + p2_dst, p1_dst, p0_dst, q0_dst, q1_dst, q2_dst, 16); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void chroma_intra_core(vuint8mf2_t *p_p0_dst, vuint8mf2_t *p_q0_dst, + vuint8mf2_t p1, vuint8mf2_t p0, + vuint8mf2_t 
q0, vuint8mf2_t q1, + int alpha, int beta, int vl) +{ + vint16m1_t p1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p1, 0, vl)); + vint16m1_t p0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(p0, 0, vl)); + vint16m1_t q0_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q0, 0, vl)); + vint16m1_t q1_i16 = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vwaddu_vx_u16m1(q1, 0, vl)); + + vint16m1_t sub_q0p0 = __riscv_vsub_vv_i16m1(q0_i16, p0_i16, vl); + vint16m1_t sub_p1p0 = __riscv_vsub_vv_i16m1(p1_i16, p0_i16, vl); + vint16m1_t sub_q1q0 = __riscv_vsub_vv_i16m1(q1_i16, q0_i16, vl); + + vint16m1_t rsub_q0p0 = __riscv_vrsub_vx_i16m1(sub_q0p0, 0, vl); + vint16m1_t rsub_p1p0 = __riscv_vrsub_vx_i16m1(sub_p1p0, 0, vl); + vint16m1_t rsub_q1q0 = __riscv_vrsub_vx_i16m1(sub_q1q0, 0, vl); + + vint16m1_t abd_q0p0 = __riscv_vmax_vv_i16m1(sub_q0p0, rsub_q0p0, vl); + vint16m1_t abd_p1p0_ = __riscv_vmax_vv_i16m1(sub_p1p0, rsub_p1p0, vl); + vint16m1_t abd_q1q0 = __riscv_vmax_vv_i16m1(sub_q1q0, rsub_q1q0, vl); + + vbool16_t cond11 = __riscv_vmslt_vx_i16m1_b16(abd_q0p0, alpha, vl); + vbool16_t cond12 = __riscv_vmslt_vx_i16m1_b16_mu(cond11, cond11, abd_p1p0_, beta, vl); + vbool16_t cond13 = __riscv_vmslt_vx_i16m1_b16_mu(cond12, cond12, abd_q1q0, beta, vl); + + vint16m1_t p0_new1_i16 = __riscv_vadd_vv_i16m1(p0_i16, q1_i16, vl); + vint16m1_t q0_new1_i16 = __riscv_vadd_vv_i16m1(q0_i16, p1_i16, vl); + p0_new1_i16 = __riscv_vmacc_vx_i16m1(p0_new1_i16, 2, p1_i16, vl); + q0_new1_i16 = __riscv_vmacc_vx_i16m1(q0_new1_i16, 2, q1_i16, vl); + + *p_p0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, p0, __riscv_vreinterpret_v_i16m1_u16m1(p0_new1_i16), 2, vl); + *p_q0_dst = __riscv_vnclipu_wx_u8mf2_mu(cond13, q0, __riscv_vreinterpret_v_i16m1_u16m1(q0_new1_i16), 2, vl); +} + +__attribute__((always_inline)) static void v_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p1 = __riscv_vle8_v_u8mf2(p_iter - 2 * stride, vl); + vuint8mf2_t p0 = __riscv_vle8_v_u8mf2(p_iter - stride, vl); + vuint8mf2_t q0 = __riscv_vle8_v_u8mf2(p_iter, vl); + vuint8mf2_t q1 = __riscv_vle8_v_u8mf2(p_iter + stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vse8_v_u8mf2(p_iter - stride, p0_dst, vl); + __riscv_vse8_v_u8mf2(p_iter, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(width); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +__attribute__((always_inline)) static void h_loop_filter_chroma_mbaff_intra(uint8_t *p_pix, ptrdiff_t stride, + int width, int alpha, int beta) +{ + 
uint8_t *p_iter = p_pix; + + size_t vxrm = __builtin_rvv_vgetvxrm(); + __builtin_rvv_vsetvxrm(VE_TONEARESTUP); + + int count = width; + + while (count > 0) + { + int vl = __riscv_vsetvl_e8mf2(count); + + vuint8mf2_t p1, p0, q0, q1; + __riscv_vlsseg4e8_v_u8mf2(&p1, &p0, &q0, &q1, p_iter - 2, stride, vl); + + vuint8mf2_t p0_dst, q0_dst; + chroma_intra_core(&p0_dst, &q0_dst, p1, p0, q0, q1, alpha, beta, vl); + + __riscv_vssseg2e8_v_u8mf2(p_iter - 1, stride, p0_dst, q0_dst, vl); + + count -= vl; + p_iter = p_iter + vl * stride; + } + + __builtin_rvv_vsetvxrm(vxrm); +} + +void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + v_loop_filter_luma(pix, stride, 16, alpha, beta, p_tc0); +} + +void h264_h_loop_filter_luma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + h_loop_filter_luma(p_pix, stride, 16, alpha, beta, p_tc0); +} + +void h264_v_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + v_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0); +} + +void h264_h_loop_filter_chroma_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta, int8_t *p_tc0) +{ + h_loop_filter_chroma(p_pix, stride, 8, alpha, beta, p_tc0); +} + +void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + v_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta); +} + +void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_luma_intra(p_pix, stride, 16, alpha, beta); +} + +void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + v_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta); +} + +void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_chroma_intra(p_pix, stride, 8, alpha, beta); +} + +void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta) +{ + h_loop_filter_chroma_mbaff_intra(p_pix, stride, 4, alpha, beta); +} +#endif diff --git a/libavcodec/riscv/h264_inloop.h b/libavcodec/riscv/h264_inloop.h new file mode 100644 index 0000000000..3c60e45395 --- /dev/null +++ b/libavcodec/riscv/h264_inloop.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_RISCV_H264_INLOOP_H +#define AVCODEC_RISCV_H264_INLOOP_H +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <stddef.h> +#include "config.h" + +#if HAVE_INTRINSICS_RVV +typedef unsigned char pixel; + +void h264_v_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +void h264_h_loop_filter_luma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); + +void h264_v_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); +void h264_h_loop_filter_chroma_8_rvv(uint8_t *pix, ptrdiff_t stride, int alpha, int beta, int8_t *tc0); + +void h264_v_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +void h264_h_loop_filter_luma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); + +void h264_v_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +void h264_h_loop_filter_chroma_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); + +void h264_h_loop_filter_chroma_mbaff_intra_8_rvv(uint8_t *p_pix, ptrdiff_t stride, int alpha, int beta); +#endif +#endif \ No newline at end of file diff --git a/libavcodec/riscv/h264_weighted_sum.c b/libavcodec/riscv/h264_weighted_sum.c new file mode 100644 index 0000000000..0ba57d0acc --- /dev/null +++ b/libavcodec/riscv/h264_weighted_sum.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 SiFive, Inc. All rights reserved. + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+__attribute__((always_inline)) static void h264_weight_128(uint8_t *p_block, ptrdiff_t stride, int width,
+                                                           int height, int log2_den, int offset)
+{
+    uint8_t *p_block_iter = p_block;
+
+    short value = (unsigned)offset << log2_den;
+
+    if (log2_den)
+        value += (1 << (log2_den - 1));
+
+    int shift = log2_den;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_begin = p_block_iter;
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row0, vl));
+            result1_w = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vzext_vf2_u16m2(row1, vl));
+
+            /* weight == 128 is a plain left shift by 7 */
+            result0_w = __riscv_vsll_vx_i16m2(result0_w, 7, vl);
+            result1_w = __riscv_vsll_vx_i16m2(result1_w, 7, vl);
+
+            /* add the pre-shifted offset and rounding bias, saturating to avoid 16-bit overflow */
+            result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+            result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+            p_block_iter += stride;
+            __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+            p_block_iter += stride;
+        }
+
+        p_block_iter = p_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
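+/*
+ * General case: widen each row with a signed-by-unsigned multiply against the
+ * broadcast weight, add the pre-shifted offset with saturation, clamp at zero
+ * and narrow back to 8 bits with a clipping right shift of log2_den.
+ */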
+__attribute__((always_inline)) static void h264_weight_normal(uint8_t *p_block, ptrdiff_t stride,
+                                                              int width, int height, int log2_den,
+                                                              int weight, int offset)
+{
+    uint8_t *p_block_iter = p_block;
+
+    short value = (unsigned)offset << log2_den;
+
+    if (log2_den)
+        value += (1 << (log2_den - 1));
+
+    int shift = log2_den;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_begin = p_block_iter;
+
+        vint8m1_t weight_v = __riscv_vmv_v_x_i8m1(weight, vl);
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t row0 = __riscv_vle8_v_u8m1(p_block_iter, vl);
+            vuint8m1_t row1 = __riscv_vle8_v_u8m1(p_block_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vwmulsu_vv_i16m2(weight_v, row0, vl);
+            result1_w = __riscv_vwmulsu_vv_i16m2(weight_v, row1, vl);
+
+            result0_w = __riscv_vsadd_vx_i16m2(result0_w, value, vl);
+            result1_w = __riscv_vsadd_vx_i16m2(result1_w, value, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_block_iter, result0_n, vl);
+            p_block_iter += stride;
+            __riscv_vse8_v_u8m1(p_block_iter, result1_n, vl);
+            p_block_iter += stride;
+        }
+
+        p_block_iter = p_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
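+/*
+ * Bidirectional weighting:
+ *     out = clip8((dst * weightd + src * weights + (((offset + 1) | 1) << log2_den)) >> (log2_den + 1))
+ * Both widening multiply-accumulates target a 16-bit accumulator that is
+ * pre-loaded with the combined rounding/offset constant.
+ */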
+__attribute__((always_inline)) static void h264_biweight(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                                          int width, int height, int log2_den,
+                                                          int weightd, int weights, int offset)
+{
+    uint8_t *p_dst_iter = p_dst;
+    uint8_t *p_src_iter = p_src;
+    short value = (unsigned int)((offset + 1) | 1) << log2_den;
+    int shift = log2_den + 1;
+
+    size_t vxrm = __builtin_rvv_vgetvxrm();
+    __builtin_rvv_vsetvxrm(VE_DOWNWARD);
+
+    int count = width;
+
+    while (count > 0)
+    {
+        int vl = __riscv_vsetvl_e8m1(count);
+        uint8_t *p_src_begin = p_src_iter;
+        uint8_t *p_dst_begin = p_dst_iter;
+
+        for (int j = 0; j < height; j += 2)
+        {
+            vuint8m1_t src0 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+            vuint8m1_t src1 = __riscv_vle8_v_u8m1(p_src_iter, vl);
+            p_src_iter += stride;
+
+            vuint8m1_t dst0 = __riscv_vle8_v_u8m1(p_dst_iter, vl);
+            vuint8m1_t dst1 = __riscv_vle8_v_u8m1(p_dst_iter + stride, vl);
+
+            vint16m2_t result0_w, result1_w;
+
+            result0_w = __riscv_vmv_v_x_i16m2(value, vl);
+            result1_w = __riscv_vmv_v_x_i16m2(value, vl);
+
+            result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weightd, dst0, vl);
+            result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weightd, dst1, vl);
+
+            result0_w = __riscv_vwmaccsu_vx_i16m2(result0_w, weights, src0, vl);
+            result1_w = __riscv_vwmaccsu_vx_i16m2(result1_w, weights, src1, vl);
+
+            result0_w = __riscv_vmax_vx_i16m2(result0_w, 0, vl);
+            result1_w = __riscv_vmax_vx_i16m2(result1_w, 0, vl);
+
+            vuint8m1_t result0_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result0_w), shift, vl);
+            vuint8m1_t result1_n = __riscv_vnclipu_wx_u8m1(__riscv_vreinterpret_v_i16m2_u16m2(result1_w), shift, vl);
+
+            __riscv_vse8_v_u8m1(p_dst_iter, result0_n, vl);
+            p_dst_iter += stride;
+            __riscv_vse8_v_u8m1(p_dst_iter, result1_n, vl);
+            p_dst_iter += stride;
+        }
+
+        p_src_iter = p_src_begin + vl;
+        p_dst_iter = p_dst_begin + vl;
+        count -= vl;
+    }
+
+    __builtin_rvv_vsetvxrm(vxrm);
+}
+
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                 int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 16, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 16, height, log2_den, weight, offset);
+    }
+}
+
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 8, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 8, height, log2_den, weight, offset);
+    }
+}
+
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset)
+{
+    if (weight == 1 && offset == 0 && log2_den == 0)
+        return;
+
+    if (weight == 128)
+    {
+        h264_weight_128(p_block, stride, 4, height, log2_den, offset);
+    }
+    else
+    {
+        h264_weight_normal(p_block, stride, 4, height, log2_den, weight, offset);
+    }
+}
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                   int height, int log2_den, int weightd,
+                                   int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 16, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd,
+                                  int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 8, height, log2_den, weightd, weights, offset);
+}
+
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd,
+                                  int weights, int offset)
+{
+    h264_biweight(p_dst, p_src, stride, 4, height, log2_den, weightd, weights, offset);
+}
+#endif
diff --git a/libavcodec/riscv/h264_weighted_sum.h b/libavcodec/riscv/h264_weighted_sum.h
new file mode 100644
index 0000000000..631d6df1fa
--- /dev/null
+++ b/libavcodec/riscv/h264_weighted_sum.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023 SiFive, Inc. All rights reserved.
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#define AVCODEC_RISCV_H264_WEIGHTED_SUM_H
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stddef.h>
+#include "config.h"
+
+#if HAVE_INTRINSICS_RVV
+typedef unsigned char pixel;
+
+void weight_h264_pixels_16_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                 int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_8_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset);
+void weight_h264_pixels_4_8_rvv(uint8_t *p_block, ptrdiff_t stride,
+                                int height, int log2_den, int weight, int offset);
+
+void biweight_h264_pixels_16_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                   int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_8_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd, int weights, int offset);
+void biweight_h264_pixels_4_8_rvv(uint8_t *p_dst, uint8_t *p_src, ptrdiff_t stride,
+                                  int height, int log2_den, int weightd, int weights, int offset);
+#endif
+#endif
\ No newline at end of file
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".