[FFmpeg-cvslog] lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv
ffmpeg | branch: master | Rémi Denis-Courmont | Sun Jun 2 12:03:33 2024 +0300| [121fb846b97db5afd3a24b0bde2be151400104be] | committer: Rémi Denis-Courmont lavc/vp7dsp: add R-V V vp7_idct_dc_add4uv This is almost the same story as vp7_idct_add4y. We just have to use strided loads of 2 64-bit elements to account for the different data layout in memory. T-Head C908: vp7_idct_dc_add4uv_c: 7.5 vp7_idct_dc_add4uv_rvv_i64: 2.0 vp8_idct_dc_add4uv_c: 6.2 vp8_idct_dc_add4uv_rvv_i32: 2.2 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 SpacemiT X60: vp7_idct_dc_add4uv_c: 6.7 vp7_idct_dc_add4uv_rvv_i64: 2.2 vp8_idct_dc_add4uv_c: 5.7 vp8_idct_dc_add4uv_rvv_i32: 2.5 (before) vp8_idct_dc_add4uv_rvv_i64: 2.0 > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=121fb846b97db5afd3a24b0bde2be151400104be --- libavcodec/riscv/vp7dsp_init.c | 3 +++ libavcodec/riscv/vp7dsp_rvv.S | 6 +++-- libavcodec/riscv/vp8dsp_init.c | 3 +-- libavcodec/riscv/vp8dsp_rvv.S | 50 +++--- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_init.c b/libavcodec/riscv/vp7dsp_init.c index fa5fb9d2ae..9b8357ec05 100644 --- a/libavcodec/riscv/vp7dsp_init.c +++ b/libavcodec/riscv/vp7dsp_init.c @@ -29,6 +29,7 @@ void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]); void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc); void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t); +void ff_vp7_idct_dc_add4uv_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t); static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride) @@ -51,6 +52,8 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c) #endif c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv; c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv; +if (flags & AV_CPU_FLAG_RVV_I64) +c->vp8_idct_dc_add4uv = ff_vp7_idct_dc_add4uv_rvv; } #endif } diff --git a/libavcodec/riscv/vp7dsp_rvv.S 
b/libavcodec/riscv/vp7dsp_rvv.S index 09dcbf3857..856b0e8c96 100644 --- a/libavcodec/riscv/vp7dsp_rvv.S +++ b/libavcodec/riscv/vp7dsp_rvv.S @@ -128,7 +128,8 @@ func ff_vp7_idct_add_rvv, zve32x endfunc #endif -func ff_vp7_idct_dc_add4y_rvv, zve32x +.irp type, y, uv +func ff_vp7_idct_dc_add4\type\()_rvv, zve32x li t0, 32 vsetivli zero, 4, e16, mf2, ta, ma li t1, 23170 @@ -141,5 +142,6 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x vadd.vx v0, v0, t2 vsetvli zero, zero, e16, mf2, ta, ma vnsra.wi v8, v0, 18 # 4x DC -tail ff_vp78_idct_dc_add4y_rvv +tail ff_vp78_idct_dc_add4\type\()_rvv endfunc +.endr diff --git a/libavcodec/riscv/vp8dsp_init.c b/libavcodec/riscv/vp8dsp_init.c index 836237b41c..5911d195ba 100644 --- a/libavcodec/riscv/vp8dsp_init.c +++ b/libavcodec/riscv/vp8dsp_init.c @@ -131,9 +131,8 @@ av_cold void ff_vp8dsp_init_riscv(VP8DSPContext *c) #endif c->vp8_idct_dc_add = ff_vp8_idct_dc_add_rvv; c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_rvv; -if (flags & AV_CPU_FLAG_RVB_ADDR) { +if (flags & AV_CPU_FLAG_RVV_I64) c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_rvv; -} } #endif } diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 458eebb306..c83f9eec71 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -157,6 +157,43 @@ func ff_vp78_idct_dc_add4y_rvv, zve32x ret endfunc +func ff_vp8_idct_dc_add4uv_rvv, zve32x +li t0, 32 +vsetivli zero, 4, e16, mf2, ta, ma +li t1, 4 - (128 << 3) +vlse16.v v8, (a1), t0 +vadd.vx v8, v8, t1 +vsra.vi v8, v8, 3 +# fall through +endfunc + +.variant_cc ff_vp78_idct_dc_add4uv_rvv +func ff_vp78_idct_dc_add4uv_rvv, zve64x +vsetivlizero, 16, e16, m2, ta, ma +vid.v v4 +li a4, 4 +vsrl.vi v4, v4, 2 +li t1, 128 +vrgather.vv v0, v8, v4 # replicate each DC four times +sllit2, a2, 2 +vsetivlizero, 2, e64, m1, ta, ma +1: +vlse64.vv8, (a0), t2 +addia4, a4, -1 +vsetivlizero, 16, e8, m1, ta, ma +vwaddu.wv v16, v0, v8 +sh zero, (a1) +vnclip.wi v8, v16, 0 +addia1, a1, 32 +vxor.vx v8, v8, t1 
+vsetivlizero, 2, e64, m1, ta, ma +vsse64.vv8, (a0), t2 +add a0, a0, a2 +bneza4, 1b + +ret +endfunc + .macro vp8_idct_dc_add vlse32.v v0, (a0), a2 lha5, 0(a1) @@ -179,19 +216,6 @@ endfunc addi a1, a1, 32 .endm -func ff_vp8_idct_dc_add4uv_rvv, zve32x -
[FFmpeg-cvslog] lavc/vp8dsp: rework R-V V idct_dc_add4y
ffmpeg | branch: master | Rémi Denis-Courmont | Sun Jun 2 12:13:25 2024 +0300| [225de53c9d446ddf1cc3ece6e99c06c8cce9b78f] | committer: Rémi Denis-Courmont lavc/vp8dsp: rework R-V V idct_dc_add4y DCT-related FFmpeg functions often add an unsigned 8-bit sample to a signed 16-bit coefficient, then clip the result back to an unsigned 8-bit value. RISC-V has no signed 16-bit to unsigned 8-bit clip, so instead our most common sequence is: VWADDU.WV set SEW to 16 bits VMAX.VV zero # clip negative values to 0 set SEW to 8 bits VNCLIPU.WI # clip values over 255 to 255 and narrow Here we use a different sequence which does not require toggling the vector type. This assumes that the wide addend vector is biased by -128: VWADDU.WV VNCLIP.WI # clip values to signed 8-bit and narrow VXOR.VX 0x80 # flip sign bit (convert signed to unsigned) Also the VMAX is effectively replaced by a VXOR of half-width. In this function, this comes for free as we anyway add a constant to the wide vector in the prologue. On C908, this has no observable effects. On X60, this improves microbenchmarks by about 20%. 
> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=225de53c9d446ddf1cc3ece6e99c06c8cce9b78f --- libavcodec/riscv/vp7dsp_rvv.S | 2 +- libavcodec/riscv/vp8dsp_rvv.S | 14 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libavcodec/riscv/vp7dsp_rvv.S b/libavcodec/riscv/vp7dsp_rvv.S index 39b23c2e79..09dcbf3857 100644 --- a/libavcodec/riscv/vp7dsp_rvv.S +++ b/libavcodec/riscv/vp7dsp_rvv.S @@ -134,7 +134,7 @@ func ff_vp7_idct_dc_add4y_rvv, zve32x li t1, 23170 vlse16.v v8, (a1), t0 # block[0..3][0] vwmul.vx v0, v8, t1 -li t2, 0x2 +li t2, 0x2 - (128 << 18) vsetvli zero, zero, e32, m1, ta, ma vsra.vi v0, v0, 14 vmul.vx v0, v0, t1 diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index 8ea0a0c9bd..458eebb306 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -125,31 +125,31 @@ endfunc func ff_vp8_idct_dc_add4y_rvv, zve32x li t0, 32 vsetivli zero, 4, e16, mf2, ta, ma +li t1, 4 - (128 << 3) vlse16.v v8, (a1), t0 -vadd.vi v8, v8, 4 +vadd.vx v8, v8, t1 vsra.vi v8, v8, 3 # fall through endfunc .variant_cc ff_vp78_idct_dc_add4y_rvv -# v8 = [dc0, dc1, dc2, dc3] +# v8 = [dc0 - 128, dc1 - 128, dc2 - 128, dc3 - 128] func ff_vp78_idct_dc_add4y_rvv, zve32x vsetivlizero, 16, e16, m2, ta, ma vid.v v4 +li a4, 4 vsrl.vi v4, v4, 2 +li t1, 128 vrgather.vv v0, v8, v4 # replicate each DC four times vsetvli zero, zero, e8, m1, ta, ma -li a4, 4 1: vle8.v v8, (a0) addia4, a4, -1 vwaddu.wv v16, v0, v8 sh zero, (a1) -vsetvli zero, zero, e16, m2, ta, ma -vmax.vx v16, v16, zero +vnclip.wi v8, v16, 0 addia1, a1, 32 -vsetvli zero, zero, e8, m1, ta, ma -vnclipu.wi v8, v16, 0 +vxor.vx v8, v8, t1 vse8.v v8, (a0) add a0, a0, a2 bneza4, 1b ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] lavc/vp8dsp: remove no longer used macros
ffmpeg | branch: master | Rémi Denis-Courmont | Sun Jun 2 13:23:24 2024 +0300| [0415bb74c81ab0019e48bd2989ddf48d66918e9e] | committer: Rémi Denis-Courmont lavc/vp8dsp: remove no longer used macros > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=0415bb74c81ab0019e48bd2989ddf48d66918e9e --- libavcodec/riscv/vp8dsp_rvv.S | 22 -- 1 file changed, 22 deletions(-) diff --git a/libavcodec/riscv/vp8dsp_rvv.S b/libavcodec/riscv/vp8dsp_rvv.S index c83f9eec71..82489a7f14 100644 --- a/libavcodec/riscv/vp8dsp_rvv.S +++ b/libavcodec/riscv/vp8dsp_rvv.S @@ -194,28 +194,6 @@ func ff_vp78_idct_dc_add4uv_rvv, zve64x ret endfunc -.macro vp8_idct_dc_add -vlse32.v v0, (a0), a2 -lha5, 0(a1) -shzero, 0(a1) -addi a5, a5, 4 -srai t1, a5, 3 -vsetivli zero, 4*4, e16, m2, ta, ma -vzext.vf2 v2, v0 -vadd.vx v2, v2, t1 -vmax.vx v2, v2, zero -vsetvli zero, zero, e8, m1, ta, ma -vnclipu.wiv0, v2, 0 -vsetivli zero, 4, e8, mf4, ta, ma -vsse32.v v0, (a0), a2 -.endm - -.macro vp8_idct_dc_addy -vp8_idct_dc_add -addi a0, a0, 4 -addi a1, a1, 32 -.endm - .macro bilin_load dst type mn .ifc \type,v add t5, a2, a3 ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] libavcodec/libxvid: code cleanup (replace magic numbers)
ffmpeg | branch: master | Ramiro Polla | Thu May 30 23:20:07 2024 +0200| [01b1f4c9a5d1b7a2421f9bd6d600bcbe2519564a] | committer: Ramiro Polla libavcodec/libxvid: code cleanup (replace magic numbers) > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=01b1f4c9a5d1b7a2421f9bd6d600bcbe2519564a --- libavcodec/libxvid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/libxvid.c b/libavcodec/libxvid.c index b9ac39429d..a490f16b3f 100644 --- a/libavcodec/libxvid.c +++ b/libavcodec/libxvid.c @@ -422,13 +422,13 @@ static av_cold int xvid_encode_init(AVCodecContext *avctx) /* Decide how we should decide blocks */ switch (avctx->mb_decision) { -case 2: +case FF_MB_DECISION_RD: x->vop_flags |= XVID_VOP_MODEDECISION_RD; x->me_flags |= XVID_ME_HALFPELREFINE8_RD| XVID_ME_QUARTERPELREFINE8_RD | XVID_ME_EXTSEARCH_RD | XVID_ME_CHECKPREDICTION_RD; -case 1: +case FF_MB_DECISION_BITS: if (!(x->vop_flags & XVID_VOP_MODEDECISION_RD)) x->vop_flags |= XVID_VOP_FAST_MODEDECISION_RD; x->me_flags |= XVID_ME_HALFPELREFINE16_RD | ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] avcodec/mpegvideo_enc: give magic number a name
ffmpeg | branch: master | Ramiro Polla | Tue Jun 4 15:05:35 2024 +0200| [2d24a80e5e419217f4e9ae22ab85a92a069756e0] | committer: Ramiro Polla avcodec/mpegvideo_enc: give magic number a name > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=2d24a80e5e419217f4e9ae22ab85a92a069756e0 --- libavcodec/mpegvideo_enc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/mpegvideo_enc.c b/libavcodec/mpegvideo_enc.c index 73a9082265..82bab43e14 100644 --- a/libavcodec/mpegvideo_enc.c +++ b/libavcodec/mpegvideo_enc.c @@ -562,7 +562,7 @@ av_cold int ff_mpv_encode_init(AVCodecContext *avctx) if ((s->mpv_flags & FF_MPV_FLAG_QP_RD) && avctx->mb_decision != FF_MB_DECISION_RD) { -av_log(avctx, AV_LOG_ERROR, "QP RD needs mbd=2\n"); +av_log(avctx, AV_LOG_ERROR, "QP RD needs mbd=rd\n"); return AVERROR(EINVAL); } ___ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog To unsubscribe, visit link above, or email ffmpeg-cvslog-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-cvslog] checkasm/sw_rgb: test rgb24/bgr24 to yuv
ffmpeg | branch: master | Zhao Zhili | Thu Jun 6 01:34:45 2024 +0800| [47ba87551c24429a2a838d8da5e3e3e712584173] | committer: James Almer checkasm/sw_rgb: test rgb24/bgr24 to yuv The line width 8 is supposed to test corner case, while the performance doesn't matter. Width 1080 is also a case of unaligned to 16. Width 1920 meant for benchmark (together with --runs options). Signed-off-by: James Almer > http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=47ba87551c24429a2a838d8da5e3e3e712584173 --- tests/checkasm/sw_rgb.c | 109 1 file changed, 109 insertions(+) diff --git a/tests/checkasm/sw_rgb.c b/tests/checkasm/sw_rgb.c index 7cd815e5be..b51d0836c3 100644 --- a/tests/checkasm/sw_rgb.c +++ b/tests/checkasm/sw_rgb.c @@ -22,8 +22,11 @@ #include "libavutil/common.h" #include "libavutil/intreadwrite.h" #include "libavutil/mem_internal.h" +#include "libavutil/pixdesc.h" #include "libswscale/rgb2rgb.h" +#include "libswscale/swscale.h" +#include "libswscale/swscale_internal.h" #include "checkasm.h" @@ -179,8 +182,100 @@ static void check_interleave_bytes(void) } } +#define MAX_LINE_SIZE 1920 +static const int input_sizes[] = {8, 128, 1080, MAX_LINE_SIZE}; +static const enum AVPixelFormat rgb_formats[] = { +AV_PIX_FMT_RGB24, +AV_PIX_FMT_BGR24, +}; + +static void check_rgb_to_y(struct SwsContext *ctx) +{ +LOCAL_ALIGNED_32(uint8_t, src, [MAX_LINE_SIZE * 3]); +LOCAL_ALIGNED_32(uint8_t, dst0_y, [MAX_LINE_SIZE * 2]); +LOCAL_ALIGNED_32(uint8_t, dst1_y, [MAX_LINE_SIZE * 2]); + +declare_func(void, uint8_t *dst, const uint8_t *src, + const uint8_t *unused1, const uint8_t *unused2, int width, + uint32_t *rgb2yuv, void *opq); + +randomize_buffers(src, MAX_LINE_SIZE * 3); + +for (int i = 0; i < FF_ARRAY_ELEMS(rgb_formats); i++) { +const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(rgb_formats[i]); + +ctx->srcFormat = rgb_formats[i]; +ff_sws_init_scale(ctx); + +for (int j = 0; j < FF_ARRAY_ELEMS(input_sizes); j++) { +int w = input_sizes[j]; + +if (check_func(ctx->lumToYV12, 
"%s_to_y_%d", desc->name, w)) { +memset(dst0_y, 0xFA, MAX_LINE_SIZE * 2); +memset(dst1_y, 0xFA, MAX_LINE_SIZE * 2); + +call_ref(dst0_y, src, NULL, NULL, w, ctx->input_rgb2yuv_table, NULL); +call_new(dst1_y, src, NULL, NULL, w, ctx->input_rgb2yuv_table, NULL); + +if (memcmp(dst0_y, dst1_y, w * 2)) +fail(); + +bench_new(dst1_y, src, NULL, NULL, w, ctx->input_rgb2yuv_table, NULL); +} +} +} +} + +static void check_rgb_to_uv(struct SwsContext *ctx) +{ +LOCAL_ALIGNED_32(uint8_t, src, [MAX_LINE_SIZE * 3]); +LOCAL_ALIGNED_32(uint8_t, dst0_u, [MAX_LINE_SIZE * 2]); +LOCAL_ALIGNED_32(uint8_t, dst0_v, [MAX_LINE_SIZE * 2]); +LOCAL_ALIGNED_32(uint8_t, dst1_u, [MAX_LINE_SIZE * 2]); +LOCAL_ALIGNED_32(uint8_t, dst1_v, [MAX_LINE_SIZE * 2]); + +declare_func(void, uint8_t *dstU, uint8_t *dstV, + const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, + int width, uint32_t *pal, void *opq); + +randomize_buffers(src, MAX_LINE_SIZE * 3); + +for (int i = 0; i < 2 * FF_ARRAY_ELEMS(rgb_formats); i++) { +enum AVPixelFormat src_fmt = rgb_formats[i / 2]; +const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(src_fmt); + +ctx->chrSrcHSubSample = (i % 2) ? 0 : 1; +ctx->srcFormat = src_fmt; +ctx->dstFormat = ctx->chrSrcHSubSample ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_YUV444P; +ff_sws_init_scale(ctx); + +for (int j = 0; j < FF_ARRAY_ELEMS(input_sizes); j++) { +int w = input_sizes[j] >> ctx->chrSrcHSubSample; + +if (check_func(ctx->chrToYV12, "%s_to_uv%s_%d", desc->name, + ctx->chrSrcHSubSample ? 
"_half" : "", + input_sizes[j])) { +memset(dst0_u, 0xFF, MAX_LINE_SIZE * 2); +memset(dst0_v, 0xFF, MAX_LINE_SIZE * 2); +memset(dst1_u, 0xFF, MAX_LINE_SIZE * 2); +memset(dst1_v, 0xFF, MAX_LINE_SIZE * 2); + +call_ref(dst0_u, dst0_v, NULL, src, src, w, ctx->input_rgb2yuv_table, NULL); +call_new(dst1_u, dst1_v, NULL, src, src, w, ctx->input_rgb2yuv_table, NULL); + +if (memcmp(dst0_u, dst1_u, w * 2) || memcmp(dst0_v, dst1_v, w * 2)) +fail(); + +bench_new(dst1_u, dst1_v, NULL, src, src, w, ctx->input_rgb2yuv_table, NULL); +} +} +} +} + void checkasm_check_sw_rgb(void) { +struct SwsContext *ctx; + ff_sws_rgb2rgb_init(); check_shuffle_bytes(shuffle_bytes_2103, "shuffle_bytes_2103"); @@ -203,4 +298,18 @@ void checkasm_check_sw_rgb(void) check_interleave_bytes()