On Sun, Aug 11, 2024 at 10:01 PM Nuo Mi <nuomi2...@gmail.com> wrote:
> > > On Sun, Jul 28, 2024 at 11:19 AM Nuo Mi <nuomi2...@gmail.com> wrote: > >> memset tables in the main thread can become a bottleneck for the decoder. >> For example, if it takes 1% of the processing time for one core, the >> maximum achievable FPS will be 100. >> Moving the memset to worker threads will fix the issue. >> > will apply next week if there are no objections > Done > --- >> libavcodec/vvc/dec.c | 13 ++++- >> libavcodec/vvc/thread.c | 122 ++++++++++++++++++++++++---------------- >> libavcodec/vvc/thread.h | 1 + >> 3 files changed, 85 insertions(+), 51 deletions(-) >> >> diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c >> index 575bcfa33d..d34713296d 100644 >> --- a/libavcodec/vvc/dec.c >> +++ b/libavcodec/vvc/dec.c >> @@ -82,7 +82,13 @@ static int tl_create(TabList *l) >> if (!*t->tab) >> return AVERROR(ENOMEM); >> } >> - } else if (l->zero) { >> + } >> + return 0; >> +} >> + >> +static int tl_zero(TabList *l) >> +{ >> + if (l->zero) { >> for (int i = 0; i < l->nb_tabs; i++) { >> Tab *t = l->tabs + i; >> memset(*t->tab, 0, t->size); >> @@ -404,6 +410,11 @@ static int pic_arrays_init(VVCContext *s, >> VVCFrameContext *fc) >> return 0; >> } >> >> +int ff_vvc_per_frame_init(VVCFrameContext *fc) >> +{ >> + return frame_context_for_each_tl(fc, tl_zero); >> +} >> + >> static int min_positive(const int idx, const int diff, const int >> min_diff) >> { >> return diff > 0 && (idx < 0 || diff < min_diff); >> diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c >> index 28065d726f..74f8e4e9d0 100644 >> --- a/libavcodec/vvc/thread.c >> +++ b/libavcodec/vvc/thread.c >> @@ -40,6 +40,7 @@ typedef struct ProgressListener { >> } ProgressListener; >> >> typedef enum VVCTaskStage { >> + VVC_TASK_STAGE_INIT, // for CTU(0, 0) only >> VVC_TASK_STAGE_PARSE, >> VVC_TASK_STAGE_INTER, >> VVC_TASK_STAGE_RECON, >> @@ -175,10 +176,14 @@ static int task_has_target_score(VVCTask *t, const >> VVCTaskStage stage, const uin >> uint8_t target = 0; >> VVCFrameContext 
*fc = t->fc; >> >> + if (stage == VVC_TASK_STAGE_INIT) >> + return 1; >> + >> if (stage == VVC_TASK_STAGE_PARSE) { >> - const H266RawSPS *rsps = fc->ps.sps->r; >> - const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && >> !is_first_row(fc, t->rx, t->ry); >> - target = 2 + wpp - 1; //left parse + >> colocation + wpp - no previous stage >> + const H266RawSPS *rsps = fc->ps.sps->r; >> + const int wpp = >> rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, >> t->ry); >> + const int no_prev_stage = t->rs > 0; >> + target = 2 + wpp - no_prev_stage; >> //left parse + colocation + wpp - no_prev_stage >> } else if (stage == VVC_TASK_STAGE_INTER) { >> target = atomic_load(&t->target_inter_score); >> } else { >> @@ -399,6 +404,55 @@ static int task_priority_higher(const AVTask *_a, >> const AVTask *_b) >> return a->ry < b->ry; >> } >> >> +static void check_colocation(VVCContext *s, VVCTask *t) >> +{ >> + const VVCFrameContext *fc = t->fc; >> + >> + if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || >> fc->ps.sps->r->sps_sbtmvp_enabled_flag) { >> + VVCFrame *col = fc->ref->collocated_ref; >> + const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]; >> + if (col && first_col) { >> + //we depend on bottom and right boundary, do not - 1 for y >> + const int y = (t->ry << fc->ps.sps->ctb_log2_size_y); >> + add_progress_listener(col, &t->col_listener, t, s, >> VVC_PROGRESS_MV, y); >> + return; >> + } >> + } >> + frame_thread_add_score(s, fc->ft, t->rx, t->ry, >> VVC_TASK_STAGE_PARSE); >> +} >> + >> +static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, >> SliceContext *sc, EntryPoint *ep) >> +{ >> + const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start]; >> + VVCTask *t = ft->tasks + rs; >> + >> + frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); >> +} >> + >> +static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t) >> +{ >> + VVCFrameContext *fc = lc->fc; >> + VVCFrameThread *ft = fc->ft; >> + const 
int ret = ff_vvc_per_frame_init(fc); >> + >> + if (ret < 0) >> + return ret; >> + >> + for (int i = 0; i < fc->nb_slices; i++) { >> + SliceContext *sc = fc->slices[i]; >> + for (int j = 0; j < sc->nb_eps; j++) { >> + EntryPoint *ep = sc->eps + j; >> + for (int k = ep->ctu_start; k < ep->ctu_end; k++) { >> + const int rs = sc->sh.ctb_addr_in_curr_slice[k]; >> + VVCTask *t = ft->tasks + rs; >> + check_colocation(s, t); >> + } >> + submit_entry_point(s, ft, sc, ep); >> + } >> + } >> + return 0; >> +} >> + >> static void report_frame_progress(VVCFrameContext *fc, >> const int ry, const VVCProgress idx) >> { >> @@ -547,6 +601,7 @@ static int run_alf(VVCContext *s, VVCLocalContext >> *lc, VVCTask *t) >> #define VVC_THREAD_DEBUG >> #ifdef VVC_THREAD_DEBUG >> const static char* task_name[] = { >> + "INIT", >> "P", >> "I", >> "R", >> @@ -567,6 +622,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, >> VVCLocalContext *lc) >> VVCFrameThread *ft = fc->ft; >> const VVCTaskStage stage = t->stage; >> static const run_func run[] = { >> + run_init, >> run_parse, >> run_inter, >> run_recon, >> @@ -726,7 +782,7 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc) >> >> for (int rs = 0; rs < ft->ctu_count; rs++) { >> VVCTask *t = ft->tasks + rs; >> - task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs / >> ft->ctu_width); >> + task_init(t, rs ? 
VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT, >> fc, rs % ft->ctu_width, rs / ft->ctu_width); >> } >> >> memset(&ft->row_progress[0], 0, sizeof(ft->row_progress)); >> @@ -745,59 +801,25 @@ fail: >> return AVERROR(ENOMEM); >> } >> >> -static void check_colocation(VVCContext *s, VVCTask *t) >> -{ >> - const VVCFrameContext *fc = t->fc; >> - >> - if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || >> fc->ps.sps->r->sps_sbtmvp_enabled_flag) { >> - VVCFrame *col = fc->ref->collocated_ref; >> - const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx]; >> - if (col && first_col) { >> - //we depend on bottom and right boundary, do not - 1 for y >> - const int y = (t->ry << fc->ps.sps->ctb_log2_size_y); >> - add_progress_listener(col, &t->col_listener, t, s, >> VVC_PROGRESS_MV, y); >> - return; >> - } >> - } >> - frame_thread_add_score(s, fc->ft, t->rx, t->ry, >> VVC_TASK_STAGE_PARSE); >> -} >> - >> -static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, >> SliceContext *sc, EntryPoint *ep) >> -{ >> - const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start]; >> - VVCTask *t = ft->tasks + rs; >> - >> - frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE); >> -} >> - >> int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc) >> { >> VVCFrameThread *ft = fc->ft; >> >> - // We'll handle this in two passes: >> - // Pass 0 to initialize tasks with parser, this will help detect bit >> stream error >> - // Pass 1 to shedule location check and submit the entry point >> - for (int pass = 0; pass < 2; pass++) { >> - for (int i = 0; i < fc->nb_slices; i++) { >> - SliceContext *sc = fc->slices[i]; >> - for (int j = 0; j < sc->nb_eps; j++) { >> - EntryPoint *ep = sc->eps + j; >> - for (int k = ep->ctu_start; k < ep->ctu_end; k++) { >> - const int rs = sc->sh.ctb_addr_in_curr_slice[k]; >> - VVCTask *t = ft->tasks + rs; >> - if (pass) { >> - check_colocation(s, t); >> - } else { >> - const int ret = task_init_parse(t, sc, ep, k); >> - if (ret < 0) >> - 
return ret; >> - } >> - } >> - if (pass) >> - submit_entry_point(s, ft, sc, ep); >> + for (int i = 0; i < fc->nb_slices; i++) { >> + SliceContext *sc = fc->slices[i]; >> + for (int j = 0; j < sc->nb_eps; j++) { >> + EntryPoint *ep = sc->eps + j; >> + for (int k = ep->ctu_start; k < ep->ctu_end; k++) { >> + const int rs = sc->sh.ctb_addr_in_curr_slice[k]; >> + VVCTask *t = ft->tasks + rs; >> + const int ret = task_init_parse(t, sc, ep, k); >> + if (ret < 0) >> + return ret; >> } >> } >> } >> + frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT); >> + >> return 0; >> } >> >> diff --git a/libavcodec/vvc/thread.h b/libavcodec/vvc/thread.h >> index 8ac59b2ecf..7b15dbee59 100644 >> --- a/libavcodec/vvc/thread.h >> +++ b/libavcodec/vvc/thread.h >> @@ -32,5 +32,6 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc); >> void ff_vvc_frame_thread_free(VVCFrameContext *fc); >> int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc); >> int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc); >> +int ff_vvc_per_frame_init(VVCFrameContext *fc); >> >> #endif // AVCODEC_VVC_THREAD_H >> -- >> 2.34.1 >> >> _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".