On 1/18/2018 6:06 PM, Marton Balint wrote: > Blend function speedups on x86_64 Core i5 4460: > > ffmpeg -f lavfi -i allyuv -vf framerate=60:threads=1 -f null none > > C: 447548411 decicycles in Blend, 2048 runs, 0 skips > SSSE3: 130020087 decicycles in Blend, 2048 runs, 0 skips > AVX2: 128508221 decicycles in Blend, 2048 runs, 0 skips > > ffmpeg -f lavfi -i allyuv -vf format=yuv420p12,framerate=60:threads=1 -f null > none > > C: 228932745 decicycles in Blend, 2048 runs, 0 skips > SSE4: 123357781 decicycles in Blend, 2048 runs, 0 skips > AVX2: 121215353 decicycles in Blend, 2048 runs, 0 skips > > Signed-off-by: Marton Balint <c...@passwd.hu> > --- > libavfilter/vf_framerate.c | 24 ++++++- > libavfilter/x86/Makefile | 1 + > libavfilter/x86/vf_framerate.asm | 136 > +++++++++++++++++++++++++++++++++++++++ > 3 files changed, 158 insertions(+), 3 deletions(-) > create mode 100644 libavfilter/x86/vf_framerate.asm > > diff --git a/libavfilter/vf_framerate.c b/libavfilter/vf_framerate.c > index d315ef5d09..6a3b85910f 100644 > --- a/libavfilter/vf_framerate.c > +++ b/libavfilter/vf_framerate.c > @@ -29,11 +29,13 @@ > #define DEBUG > > #include "libavutil/avassert.h" > +#include "libavutil/cpu.h" > #include "libavutil/imgutils.h" > #include "libavutil/internal.h" > #include "libavutil/opt.h" > #include "libavutil/pixdesc.h" > #include "libavutil/pixelutils.h" > +#include "libavutil/x86/cpu.h" > > #include "avfilter.h" > #include "internal.h" > @@ -246,7 +248,7 @@ static int blend_frames(AVFilterContext *ctx, int > interpolate) > av_frame_copy_props(s->work, s->f0); > > ff_dlog(ctx, "blend_frames() INTERPOLATE to create work frame\n"); > - ctx->internal->execute(ctx, filter_slice, &td, NULL, > FFMIN(outlink->h, ff_filter_get_nb_threads(ctx))); > + ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(FFMAX(1, > outlink->h >> 2), ff_filter_get_nb_threads(ctx))); > return 1; > } > return 0; > @@ -347,6 +349,11 @@ static void blend_frames_c(BLEND_FUNC_PARAMS) > } > } > > 
+void ff_blend_frames_ssse3(BLEND_FUNC_PARAMS); > +void ff_blend_frames_avx2(BLEND_FUNC_PARAMS); > +void ff_blend_frames16_sse4(BLEND_FUNC_PARAMS); > +void ff_blend_frames16_avx2(BLEND_FUNC_PARAMS); > + > static void blend_frames16_c(BLEND_FUNC_PARAMS) > { > int line, pixel; > @@ -371,6 +378,7 @@ static int config_input(AVFilterLink *inlink) > AVFilterContext *ctx = inlink->dst; > FrameRateContext *s = ctx->priv; > const AVPixFmtDescriptor *pix_desc = av_pix_fmt_desc_get(inlink->format); > + int cpu_flags = av_get_cpu_flags(); > int plane; > > for (plane = 0; plane < 4; plane++) { > @@ -389,10 +397,20 @@ static int config_input(AVFilterLink *inlink) > > if (s->bitdepth == 8) { > s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH8; > - s->blend = blend_frames_c; > + if (ARCH_X86 && EXTERNAL_AVX2_FAST(cpu_flags)) > + s->blend = ff_blend_frames_avx2; > + else if (ARCH_X86 && EXTERNAL_SSSE3(cpu_flags)) > + s->blend = ff_blend_frames_ssse3; > + else > + s->blend = blend_frames_c; > } else { > s->blend_factor_max = 1 << BLEND_FACTOR_DEPTH16; > - s->blend = blend_frames16_c; > + if (ARCH_X86 && EXTERNAL_AVX2_FAST(cpu_flags)) > + s->blend = ff_blend_frames16_avx2; > + else if (ARCH_X86 && EXTERNAL_SSE4(cpu_flags)) > + s->blend = ff_blend_frames16_sse4; > + else > + s->blend = blend_frames16_c;
The SIMD function pointer initialization and the respective prototypes should be in a separate file in the x86 folder. Here you should only have something like: if (ARCH_X86) ff_blend_frames_init_x86(s); The corresponding pointer initialization then goes inside that function. The prototype for ff_blend_frames_init_x86() should be in a new header. See how vf_blend (and many other filters) do it. > } > > return 0; _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel