On Fri, May 31, 2019 at 1:44 PM Pavel Koshevoy <pkoshe...@gmail.com> wrote: > > > > > On Fri, May 31, 2019 at 4:46 AM Paul B Mahol <one...@gmail.com> wrote: > > > > Signed-off-by: Paul B Mahol <one...@gmail.com> > > --- > > libavfilter/vf_zscale.c | 335 +++++++++++++++++++++++++--------------- > > 1 file changed, 214 insertions(+), 121 deletions(-) > > > > diff --git a/libavfilter/vf_zscale.c b/libavfilter/vf_zscale.c > > index f0309272fa..c53bb08ccc 100644 > > --- a/libavfilter/vf_zscale.c > > +++ b/libavfilter/vf_zscale.c > > @@ -74,6 +74,16 @@ enum var_name { > > VARS_NB > > }; > > > > +typedef struct ZScaleThreadContext { > > + void *tmp; > > + size_t tmp_size; > > + > > + zimg_image_format src_format, dst_format; > > + zimg_image_format alpha_src_format, alpha_dst_format; > > + zimg_graph_builder_params alpha_params, params; > > + zimg_filter_graph *alpha_graph, *graph; > > +} ZScaleThreadContext; > > + > > typedef struct ZScaleContext { > > const AVClass *class; > > > > @@ -100,6 +110,8 @@ typedef struct ZScaleContext { > > double nominal_peak_luminance; > > int approximate_gamma; > > > > + int nb_threads; > > + > > char *w_expr; ///< width expression string > > char *h_expr; ///< height expression string > > > > @@ -110,13 +122,7 @@ typedef struct ZScaleContext { > > > > int force_original_aspect_ratio; > > > > - void *tmp; > > - size_t tmp_size; > > - > > - zimg_image_format src_format, dst_format; > > - zimg_image_format alpha_src_format, alpha_dst_format; > > - zimg_graph_builder_params alpha_params, params; > > - zimg_filter_graph *alpha_graph, *graph; > > + ZScaleThreadContext *ztd; > > > > enum AVColorSpace in_colorspace, out_colorspace; > > enum AVColorTransferCharacteristic in_trc, out_trc; > > @@ -204,6 +210,12 @@ static int config_props(AVFilterLink *outlink) > > int ret; > > int factor_w, factor_h; > > > > + s->nb_threads = ff_filter_get_nb_threads(ctx); > > + av_freep(&s->ztd); > > + s->ztd = av_calloc(s->nb_threads, sizeof(*s->ztd)); > > + if 
(!s->ztd) > > + return AVERROR(ENOMEM); > > + > > var_values[VAR_IN_W] = var_values[VAR_IW] = inlink->w; > > var_values[VAR_IN_H] = var_values[VAR_IH] = inlink->h; > > var_values[VAR_OUT_W] = var_values[VAR_OW] = NAN; > > @@ -458,10 +470,12 @@ static int convert_range(enum AVColorRange > > color_range) > > } > > > > static void format_init(zimg_image_format *format, AVFrame *frame, const > > AVPixFmtDescriptor *desc, > > - int colorspace, int primaries, int transfer, int > > range, int location) > > + int colorspace, int primaries, int transfer, int > > range, int location, > > + int width, int height, > > + double active_top, double active_height, int > > set_active) > > { > > - format->width = frame->width; > > - format->height = frame->height; > > + format->width = width; > > + format->height = height; > > format->subsample_w = desc->log2_chroma_w; > > format->subsample_h = desc->log2_chroma_h; > > format->depth = desc->comp[0].depth; > > @@ -472,6 +486,10 @@ static void format_init(zimg_image_format *format, > > AVFrame *frame, const AVPixFm > > format->transfer_characteristics = transfer == - 1 ? > > convert_trc(frame->color_trc) : transfer; > > format->pixel_range = (desc->flags & AV_PIX_FMT_FLAG_RGB) ? > > ZIMG_RANGE_FULL : range == -1 ? convert_range(frame->color_range) : range; > > format->chroma_location = location == -1 ? 
> > convert_chroma_location(frame->chroma_location) : location; > > + if (!set_active) > > + return; > > + format->active_region.top = active_top; > > + format->active_region.height = active_height; > > } > > > > static int graph_build(zimg_filter_graph **graph, > > zimg_graph_builder_params *params, > > @@ -502,16 +520,163 @@ static int graph_build(zimg_filter_graph **graph, > > zimg_graph_builder_params *par > > return 0; > > } > > > > +typedef struct ThreadData { > > + AVFrame *in, *out; > > + const AVPixFmtDescriptor *desc, *odesc; > > +} ThreadData; > > + > > +static int prepare_graph(AVFilterContext *ctx, void *arg, int jobnr, int > > nb_jobs) > > +{ > > + ZScaleContext *s = ctx->priv; > > + ThreadData *td = arg; > > + AVFrame *in = td->in; > > + AVFrame *out = td->out; > > + const AVPixFmtDescriptor *desc = td->desc; > > + const AVPixFmtDescriptor *odesc = td->odesc; > > + const int in_slice_start = (in->height * jobnr) / nb_jobs; > > + const int in_slice_end = (in->height * (jobnr+1)) / nb_jobs; > > + const int out_slice_start = (out->height * jobnr) / nb_jobs; > > + const int out_slice_end = (out->height * (jobnr+1)) / nb_jobs; > > + const double scale_h = (double)in->height / (double)out->height; > > + double active_top = out_slice_start * scale_h; > > + double active_height = (out_slice_end - out_slice_start) * scale_h; > > + int ret; > > + > > + zimg_image_format_default(&s->ztd[jobnr].src_format, ZIMG_API_VERSION); > > + zimg_image_format_default(&s->ztd[jobnr].dst_format, ZIMG_API_VERSION); > > + zimg_graph_builder_params_default(&s->ztd[jobnr].params, > > ZIMG_API_VERSION); > > + > > + s->ztd[jobnr].params.dither_type = s->dither; > > + s->ztd[jobnr].params.cpu_type = ZIMG_CPU_AUTO; > > + s->ztd[jobnr].params.resample_filter = s->filter; > > + s->ztd[jobnr].params.resample_filter_uv = s->filter; > > + s->ztd[jobnr].params.nominal_peak_luminance = > > s->nominal_peak_luminance; > > + s->ztd[jobnr].params.allow_approximate_gamma = s->approximate_gamma; 
> > + > > + format_init(&s->ztd[jobnr].src_format, in, desc, s->colorspace_in, > > + s->primaries_in, s->trc_in, s->range_in, s->chromal_in, > > + in->width, in->height, > > + active_top, active_height, 1); > > + format_init(&s->ztd[jobnr].dst_format, out, odesc, s->colorspace, > > + s->primaries, s->trc, s->range, s->chromal, > > + out->width, out_slice_end - out_slice_start, > > + 0, 0, 0); > > + > > + ret = graph_build(&s->ztd[jobnr].graph, &s->ztd[jobnr].params, > > &s->ztd[jobnr].src_format, &s->ztd[jobnr].dst_format, > > + &s->ztd[jobnr].tmp, &s->ztd[jobnr].tmp_size); > > + if (ret) > > + return ret; > > + > > + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & > > AV_PIX_FMT_FLAG_ALPHA) { > > + zimg_image_format_default(&s->ztd[jobnr].alpha_src_format, > > ZIMG_API_VERSION); > > + zimg_image_format_default(&s->ztd[jobnr].alpha_dst_format, > > ZIMG_API_VERSION); > > + zimg_graph_builder_params_default(&s->ztd[jobnr].alpha_params, > > ZIMG_API_VERSION); > > + > > + s->ztd[jobnr].alpha_params.dither_type = s->dither; > > + s->ztd[jobnr].alpha_params.cpu_type = ZIMG_CPU_AUTO; > > + s->ztd[jobnr].alpha_params.resample_filter = s->filter; > > + > > + s->ztd[jobnr].alpha_src_format.width = in->width; > > + s->ztd[jobnr].alpha_src_format.height = in->height; > > + s->ztd[jobnr].alpha_src_format.depth = desc->comp[0].depth; > > + s->ztd[jobnr].alpha_src_format.pixel_type = (desc->flags & > > AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? 
> > ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE; > > + s->ztd[jobnr].alpha_src_format.color_family = ZIMG_COLOR_GREY; > > + s->ztd[jobnr].alpha_src_format.active_region.left = 0; > > + s->ztd[jobnr].alpha_src_format.active_region.top = in_slice_start; > > + s->ztd[jobnr].alpha_src_format.active_region.width = in->width; > > + s->ztd[jobnr].alpha_src_format.active_region.height = in_slice_end > > - in_slice_start; > > + > > + s->ztd[jobnr].alpha_dst_format.width = out->width; > > + s->ztd[jobnr].alpha_dst_format.height = out->height; > > + s->ztd[jobnr].alpha_dst_format.depth = odesc->comp[0].depth; > > + s->ztd[jobnr].alpha_dst_format.pixel_type = (odesc->flags & > > AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? > > ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE; > > + s->ztd[jobnr].alpha_dst_format.color_family = ZIMG_COLOR_GREY; > > + > > + zimg_filter_graph_free(s->ztd[jobnr].alpha_graph); > > + s->ztd[jobnr].alpha_graph = > > zimg_filter_graph_build(&s->ztd[jobnr].alpha_src_format, > > &s->ztd[jobnr].alpha_dst_format, &s->ztd[jobnr].alpha_params); > > + if (!s->ztd[jobnr].alpha_graph) { > > + return print_zimg_error(ctx); > > + } > > + } > > + > > + return 0; > > +} > > + > > +static int zscale_slice(AVFilterContext *ctx, void *arg, int jobnr, int > > nb_jobs) > > +{ > > + ZScaleContext *s = ctx->priv; > > + ThreadData *td = arg; > > + AVFrame *in = td->in; > > + AVFrame *out = td->out; > > + const AVPixFmtDescriptor *desc = td->desc; > > + const AVPixFmtDescriptor *odesc = td->odesc; > > + zimg_image_buffer_const src_buf = { ZIMG_API_VERSION }; > > + zimg_image_buffer dst_buf = { ZIMG_API_VERSION }; > > + int ret = AVERROR(EINVAL); > > + > > + for (int plane = 0; plane < 3; plane++) { > > + const int height = plane > 0 ? 
AV_CEIL_RSHIFT(out->height, > > odesc->log2_chroma_h) : out->height; > > + const int out_slice_start = (height * jobnr) / nb_jobs; > > + int p = desc->comp[plane].plane; > > + > > + src_buf.plane[plane].data = in->data[p]; > > + src_buf.plane[plane].stride = in->linesize[p]; > > + src_buf.plane[plane].mask = -1; > > + > > + p = odesc->comp[plane].plane; > > + dst_buf.plane[plane].data = out->data[p] + out_slice_start * > > out->linesize[p]; > > + dst_buf.plane[plane].stride = out->linesize[p]; > > + dst_buf.plane[plane].mask = -1; > > + } > > + > > + if (s->ztd[jobnr].graph) > > + ret = zimg_filter_graph_process(s->ztd[jobnr].graph, &src_buf, > > &dst_buf, s->ztd[jobnr].tmp, 0, 0, 0, 0); > > + if (ret) > > + return print_zimg_error(ctx); > > + > > + if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & > > AV_PIX_FMT_FLAG_ALPHA) { > > + const int out_slice_start = (out->height * jobnr) / nb_jobs; > > + > > + src_buf.plane[0].data = in->data[3]; > > + src_buf.plane[0].stride = in->linesize[3]; > > + src_buf.plane[0].mask = -1; > > + > > + dst_buf.plane[0].data = out->data[3] + out_slice_start * > > out->linesize[3]; > > + dst_buf.plane[0].stride = out->linesize[3]; > > + dst_buf.plane[0].mask = -1; > > + > > + ret = zimg_filter_graph_process(s->ztd[jobnr].alpha_graph, > > &src_buf, &dst_buf, s->ztd[jobnr].tmp, 0, 0, 0, 0); > > + if (ret) > > + return print_zimg_error(ctx); > > + } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) { > > + int x, y; > > + > > + if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) { > > + for (y = 0; y < out->height; y++) { > > + for (x = 0; x < out->width; x++) { > > + AV_WN32(out->data[3] + x * odesc->comp[3].step + y * > > out->linesize[3], > > + av_float2int(1.0f)); > > + } > > + } > > + } else { > > + for (y = 0; y < out->height; y++) > > + memset(out->data[3] + y * out->linesize[3], 0xff, > > out->width); > > + } > > + } > > + > > + return 0; > > +} > > + > > static int filter_frame(AVFilterLink *link, AVFrame *in) > > { > > - 
ZScaleContext *s = link->dst->priv; > > - AVFilterLink *outlink = link->dst->outputs[0]; > > + AVFilterContext *ctx = link->dst; > > + ZScaleContext *s = ctx->priv; > > + AVFilterLink *outlink = ctx->outputs[0]; > > const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(link->format); > > const AVPixFmtDescriptor *odesc = av_pix_fmt_desc_get(outlink->format); > > - zimg_image_buffer_const src_buf = { ZIMG_API_VERSION }; > > - zimg_image_buffer dst_buf = { ZIMG_API_VERSION }; > > char buf[32]; > > - int ret = 0, plane; > > + int ret = 0; > > + ThreadData td; > > AVFrame *out; > > > > out = ff_get_video_buffer(outlink, outlink->w, outlink->h); > > @@ -552,41 +717,28 @@ static int filter_frame(AVFilterLink *link, AVFrame > > *in) > > return ret; > > } > > > > - zimg_image_format_default(&s->src_format, ZIMG_API_VERSION); > > - zimg_image_format_default(&s->dst_format, ZIMG_API_VERSION); > > - zimg_graph_builder_params_default(&s->params, ZIMG_API_VERSION); > > - > > - s->params.dither_type = s->dither; > > - s->params.cpu_type = ZIMG_CPU_AUTO; > > - s->params.resample_filter = s->filter; > > - s->params.resample_filter_uv = s->filter; > > - s->params.nominal_peak_luminance = s->nominal_peak_luminance; > > - s->params.allow_approximate_gamma = s->approximate_gamma; > > - > > - format_init(&s->src_format, in, desc, s->colorspace_in, > > - s->primaries_in, s->trc_in, s->range_in, > > s->chromal_in); > > - format_init(&s->dst_format, out, odesc, s->colorspace, > > - s->primaries, s->trc, s->range, s->chromal); > > + td.out = out; > > + td.in = in; > > + td.desc = desc; > > + td.odesc = odesc; > > + ret = ctx->internal->execute(ctx, prepare_graph, &td, NULL, > > FFMIN3(in->height, out->height, s->nb_threads)); > > + if (ret) > > + goto fail; > > > > if (s->colorspace != -1) > > - out->colorspace = (int)s->dst_format.matrix_coefficients; > > + out->colorspace = > > (int)s->ztd[0].dst_format.matrix_coefficients; > > > > if (s->primaries != -1) > > - out->color_primaries = 
(int)s->dst_format.color_primaries; > > + out->color_primaries = > > (int)s->ztd[0].dst_format.color_primaries; > > > > if (s->range != -1) > > - out->color_range = (int)s->dst_format.pixel_range + 1; > > + out->color_range = (int)s->ztd[0].dst_format.pixel_range + 1; > > > > if (s->trc != -1) > > - out->color_trc = (int)s->dst_format.transfer_characteristics; > > + out->color_trc = > > (int)s->ztd[0].dst_format.transfer_characteristics; > > > > if (s->chromal != -1) > > - out->chroma_location = (int)s->dst_format.chroma_location - 1; > > - > > - ret = graph_build(&s->graph, &s->params, &s->src_format, > > &s->dst_format, > > - &s->tmp, &s->tmp_size); > > - if (ret < 0) > > - goto fail; > > + out->chroma_location = > > (int)s->ztd[0].dst_format.chroma_location - 1; > > > > s->in_colorspace = in->colorspace; > > s->in_trc = in->color_trc; > > @@ -596,101 +748,38 @@ static int filter_frame(AVFilterLink *link, AVFrame > > *in) > > s->out_trc = out->color_trc; > > s->out_primaries = out->color_primaries; > > s->out_range = out->color_range; > > - > > - if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & > > AV_PIX_FMT_FLAG_ALPHA) { > > - zimg_image_format_default(&s->alpha_src_format, > > ZIMG_API_VERSION); > > - zimg_image_format_default(&s->alpha_dst_format, > > ZIMG_API_VERSION); > > - zimg_graph_builder_params_default(&s->alpha_params, > > ZIMG_API_VERSION); > > - > > - s->alpha_params.dither_type = s->dither; > > - s->alpha_params.cpu_type = ZIMG_CPU_AUTO; > > - s->alpha_params.resample_filter = s->filter; > > - > > - s->alpha_src_format.width = in->width; > > - s->alpha_src_format.height = in->height; > > - s->alpha_src_format.depth = desc->comp[0].depth; > > - s->alpha_src_format.pixel_type = (desc->flags & > > AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : desc->comp[0].depth > 8 ? 
> > ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE; > > - s->alpha_src_format.color_family = ZIMG_COLOR_GREY; > > - > > - s->alpha_dst_format.width = out->width; > > - s->alpha_dst_format.height = out->height; > > - s->alpha_dst_format.depth = odesc->comp[0].depth; > > - s->alpha_dst_format.pixel_type = (odesc->flags & > > AV_PIX_FMT_FLAG_FLOAT) ? ZIMG_PIXEL_FLOAT : odesc->comp[0].depth > 8 ? > > ZIMG_PIXEL_WORD : ZIMG_PIXEL_BYTE; > > - s->alpha_dst_format.color_family = ZIMG_COLOR_GREY; > > - > > - zimg_filter_graph_free(s->alpha_graph); > > - s->alpha_graph = zimg_filter_graph_build(&s->alpha_src_format, > > &s->alpha_dst_format, &s->alpha_params); > > - if (!s->alpha_graph) { > > - ret = print_zimg_error(link->dst); > > - goto fail; > > - } > > - } > > } > > > > if (s->colorspace != -1) > > - out->colorspace = (int)s->dst_format.matrix_coefficients; > > + out->colorspace = (int)s->ztd[0].dst_format.matrix_coefficients; > > > > if (s->primaries != -1) > > - out->color_primaries = (int)s->dst_format.color_primaries; > > + out->color_primaries = (int)s->ztd[0].dst_format.color_primaries; > > > > if (s->range != -1) > > - out->color_range = (int)s->dst_format.pixel_range; > > + out->color_range = (int)s->ztd[0].dst_format.pixel_range; > > > > if (s->trc != -1) > > - out->color_trc = (int)s->dst_format.transfer_characteristics; > > + out->color_trc = > > (int)s->ztd[0].dst_format.transfer_characteristics; > > + > > + if (s->chromal != -1) > > + out->chroma_location = (int)s->ztd[0].dst_format.chroma_location - > > 1; > > > > av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den, > > (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w, > > (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h, > > INT_MAX); > > > > - for (plane = 0; plane < 3; plane++) { > > - int p = desc->comp[plane].plane; > > - src_buf.plane[plane].data = in->data[p]; > > - src_buf.plane[plane].stride = in->linesize[p]; > > - src_buf.plane[plane].mask = -1; > > - > > - p = 
odesc->comp[plane].plane; > > - dst_buf.plane[plane].data = out->data[p]; > > - dst_buf.plane[plane].stride = out->linesize[p]; > > - dst_buf.plane[plane].mask = -1; > > - } > > - > > - ret = zimg_filter_graph_process(s->graph, &src_buf, &dst_buf, s->tmp, > > 0, 0, 0, 0); > > - if (ret) { > > - ret = print_zimg_error(link->dst); > > + if (!s->ztd[0].graph) { > > + ret = AVERROR(EINVAL); > > goto fail; > > } > > > > - if (desc->flags & AV_PIX_FMT_FLAG_ALPHA && odesc->flags & > > AV_PIX_FMT_FLAG_ALPHA) { > > - src_buf.plane[0].data = in->data[3]; > > - src_buf.plane[0].stride = in->linesize[3]; > > - src_buf.plane[0].mask = -1; > > - > > - dst_buf.plane[0].data = out->data[3]; > > - dst_buf.plane[0].stride = out->linesize[3]; > > - dst_buf.plane[0].mask = -1; > > - > > - ret = zimg_filter_graph_process(s->alpha_graph, &src_buf, > > &dst_buf, s->tmp, 0, 0, 0, 0); > > - if (ret) { > > - ret = print_zimg_error(link->dst); > > - goto fail; > > - } > > - } else if (odesc->flags & AV_PIX_FMT_FLAG_ALPHA) { > > - int x, y; > > - > > - if (odesc->flags & AV_PIX_FMT_FLAG_FLOAT) { > > - for (y = 0; y < out->height; y++) { > > - for (x = 0; x < out->width; x++) { > > - AV_WN32(out->data[3] + x * odesc->comp[3].step + y * > > out->linesize[3], > > - av_float2int(1.0f)); > > - } > > - } > > - } else { > > - for (y = 0; y < outlink->h; y++) > > - memset(out->data[3] + y * out->linesize[3], 0xff, > > outlink->w); > > - } > > - } > > + td.out = out; > > + td.in = in; > > + td.desc = desc; > > + td.odesc = odesc; > > + ret = ctx->internal->execute(ctx, zscale_slice, &td, NULL, > > FFMIN3(in->height, out->height, s->nb_threads)); > > > > fail: > > av_frame_free(&in); > > @@ -706,10 +795,13 @@ static void uninit(AVFilterContext *ctx) > > { > > ZScaleContext *s = ctx->priv; > > > > - zimg_filter_graph_free(s->graph); > > - zimg_filter_graph_free(s->alpha_graph); > > - av_freep(&s->tmp); > > - s->tmp_size = 0; > > + for (int i = 0; i < s->nb_threads; i++) { > > + 
zimg_filter_graph_free(s->ztd[i].graph); > > + zimg_filter_graph_free(s->ztd[i].alpha_graph); > > + av_freep(&s->ztd[i].tmp); > > + s->ztd[i].tmp_size = 0; > > + } > > + av_freep(&s->ztd); > > } > > > > static int process_command(AVFilterContext *ctx, const char *cmd, const > > char *args, > > @@ -890,4 +982,5 @@ AVFilter ff_vf_zscale = { > > .inputs = avfilter_vf_zscale_inputs, > > .outputs = avfilter_vf_zscale_outputs, > > .process_command = process_command, > > + .flags = AVFILTER_FLAG_SLICE_THREADS, > > }; > > -- > > 2.17.1 > > > I've had to use zscale to convert 10-bit 4k60p video from HLG HDR to SDR > (bt709). It was ~36x times slower than real time. What I ended up doing to > speed it up was to generate CLUT image (16-bit yuv444 65x65x65 sampling of > input color space), lay it out as a 2D image (512x537), and run it through > zscale to generate the HDR->SDR transform CLUT. Then I used the CLUT instead > of zscale for every frame... that got me to about ~3.5x times slower than > realtime converting 60fps 10-bit 4k HLG to SDR (and I don't know any > assembly, so I didn't attempt to optimize the CLUT trilinear optimization > with SIMD, so maybe it could be faster still). I then ported to CUDA and was > able to convert 4k60p HLG->SDR faster than realtime on a Pascal GPU. >
Correction: I meant "CLUT trilinear interpolation", not "CLUT trilinear optimization". > So, I'm not sure that adding slice threading to zscale is the best > optimization for it. I think capturing the effect of zscale in a CLUT would > be a more significant optimization. > > Just my 2 cents, hope this helps. > > Pavel. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".