On 11/12/2017 4:15 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <one...@gmail.com> > --- > libavfilter/threshold.h | 51 +++++++++++++++++++++++++++ > libavfilter/vf_threshold.c | 32 +++++------------ > libavfilter/x86/Makefile | 2 ++ > libavfilter/x86/vf_threshold.asm | 69 > +++++++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_threshold_init.c | 41 ++++++++++++++++++++++ > 5 files changed, 171 insertions(+), 24 deletions(-) > create mode 100644 libavfilter/threshold.h > create mode 100644 libavfilter/x86/vf_threshold.asm > create mode 100644 libavfilter/x86/vf_threshold_init.c > > diff --git a/libavfilter/threshold.h b/libavfilter/threshold.h > new file mode 100644 > index 0000000000..8b55ad6ba1 > --- /dev/null > +++ b/libavfilter/threshold.h > @@ -0,0 +1,51 @@ > +/* > + * Copyright (c) 2016 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#ifndef AVFILTER_THRESHOLD_H > +#define AVFILTER_THRESHOLD_H > + > +#include "avfilter.h" > +#include "framesync.h" > + > +typedef struct ThresholdContext { > + const AVClass *class; > + > + int depth; > + int planes; > + int bpc; > + > + int nb_planes; > + int width[4], height[4]; > + > + void (*threshold)(const uint8_t *in, const uint8_t *threshold, > + const uint8_t *min, const uint8_t *max, > + uint8_t *out, > + ptrdiff_t ilinesize, ptrdiff_t tlinesize, > + ptrdiff_t flinesize, ptrdiff_t slinesize, > + ptrdiff_t olinesize, > + int w, int h); > + > + AVFrame *frames[4]; > + FFFrameSync fs; > +} ThresholdContext; > + > +void ff_threshold_init_x86(ThresholdContext *s); > + > +#endif /* AVFILTER_THRESHOLD_H */ > diff --git a/libavfilter/vf_threshold.c b/libavfilter/vf_threshold.c > index 88f6ef28d7..4183b353d2 100644 > --- a/libavfilter/vf_threshold.c > +++ b/libavfilter/vf_threshold.c > @@ -31,27 +31,7 @@ > #include "framesync.h" > #include "internal.h" > #include "video.h" > - > -typedef struct ThresholdContext { > - const AVClass *class; > - > - int planes; > - int bpc; > - > - int nb_planes; > - int width[4], height[4]; > - > - void (*threshold)(const uint8_t *in, const uint8_t *threshold, > - const uint8_t *min, const uint8_t *max, > - uint8_t *out, > - ptrdiff_t ilinesize, ptrdiff_t tlinesize, > - ptrdiff_t flinesize, ptrdiff_t slinesize, > - ptrdiff_t olinesize, > - int w, int h); > - > - AVFrame *frames[4]; > - FFFrameSync fs; > -} ThresholdContext; > +#include "threshold.h" > > #define OFFSET(x) offsetof(ThresholdContext, x) > #define FLAGS AV_OPT_FLAG_VIDEO_PARAM|AV_OPT_FLAG_FILTERING_PARAM > @@ -155,7 +135,7 @@ static void threshold8(const uint8_t *in, const uint8_t > *threshold, > in += ilinesize; > threshold += 
tlinesize; > min += flinesize; > - max += flinesize; > + max += slinesize;
Unrelated fix? > out += olinesize; > } > } > @@ -183,7 +163,7 @@ static void threshold16(const uint8_t *iin, const uint8_t > *tthreshold, > in += ilinesize / 2; > threshold += tlinesize / 2; > min += flinesize / 2; > - max += flinesize / 2; > + max += slinesize / 2; Same. > out += olinesize / 2; > } > } > @@ -203,8 +183,9 @@ static int config_input(AVFilterLink *inlink) > s->height[0] = s->height[3] = inlink->h; > s->width[1] = s->width[2] = AV_CEIL_RSHIFT(inlink->w, hsub); > s->width[0] = s->width[3] = inlink->w; > + s->depth = desc->comp[0].depth; > > - if (desc->comp[0].depth == 8) { > + if (s->depth == 8) { > s->threshold = threshold8; > s->bpc = 1; > } else { > @@ -212,6 +193,9 @@ static int config_input(AVFilterLink *inlink) > s->bpc = 2; > } > > + if (ARCH_X86) > + ff_threshold_init_x86(s); > + > return 0; > } > > diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile > index 3431625883..c10f4d5538 100644 > --- a/libavfilter/x86/Makefile > +++ b/libavfilter/x86/Makefile > @@ -20,6 +20,7 @@ OBJS-$(CONFIG_SPP_FILTER) += x86/vf_spp.o > OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim_init.o > OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d_init.o > OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend_init.o > +OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold_init.o > OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_tinterlace_init.o > OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume_init.o > OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif_init.o > @@ -46,6 +47,7 @@ X86ASM-OBJS-$(CONFIG_SHOWCQT_FILTER) += > x86/avf_showcqt.o > X86ASM-OBJS-$(CONFIG_SSIM_FILTER) += x86/vf_ssim.o > X86ASM-OBJS-$(CONFIG_STEREO3D_FILTER) += x86/vf_stereo3d.o > X86ASM-OBJS-$(CONFIG_TBLEND_FILTER) += x86/vf_blend.o > +X86ASM-OBJS-$(CONFIG_THRESHOLD_FILTER) += x86/vf_threshold.o > X86ASM-OBJS-$(CONFIG_TINTERLACE_FILTER) += x86/vf_interlace.o > X86ASM-OBJS-$(CONFIG_VOLUME_FILTER) += x86/af_volume.o > X86ASM-OBJS-$(CONFIG_W3FDIF_FILTER) += x86/vf_w3fdif.o > diff --git 
a/libavfilter/x86/vf_threshold.asm > b/libavfilter/x86/vf_threshold.asm > new file mode 100644 > index 0000000000..9db2f89aa8 > --- /dev/null > +++ b/libavfilter/x86/vf_threshold.asm > @@ -0,0 +1,69 @@ > +;***************************************************************************** > +;* x86-optimized functions for threshold filter > +;* > +;* Copyright (C) 2017 Paul B Mahol > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;***************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +%if ARCH_X86_64 > + > +SECTION_RODATA > + > +pb_128: times 16 db 128 > + > +SECTION .text > + > +INIT_XMM sse4 > +cglobal threshold8, 13, 13, 8, in, threshold, min, max, out, ilinesize, > tlinesize, flinesize, slinesize, olinesize, w, h, x First, you're loading 13 arguments when there are only twelve. Second, you can easily make this work in x86_32. All the linesize values are constant and can be fetched from memory at the end of each row in the add instructions. Same with h in the sub. The only values you need in GPRs are the five pointers and x. And since you still have one reg after that, you can keep w in one as well. Look at my previous x86_32 ports of your asm. 
> + mov wd, dword wm > + mov hd, dword hm Either do "movsxdifnidn wq, wd" and "movsxdifnidn hq, hd", or change the cglobal arguments to "10, 13", to avoid loading them from memory a second time. > + mova m7, [pb_128] > + add inq, wq > + add thresholdq, wq > + add minq, wq > + add maxq, wq > + add outq, wq > + neg wq > +.nextrow: > + mov xq, wq > + > + .loop: > + movu m1, [inq + xq] > + movu m0, [thresholdq + xq] > + movu m2, [minq + xq] > + movu m3, [maxq + xq] > + pxor m0, m7 > + pxor m1, m7 You're using m0 to m3, then m7 for some reason. Use m4 instead and set the number of xmm regs needed in cglobal to 5. > + pcmpgtb m0, m1 > + pblendvb m3, m2, m0 > + movu [outq + xq], m3 > + add xq, mmsize > + jl .loop > + > + add inq, ilinesizeq > + add thresholdq, tlinesizeq > + add minq, flinesizeq > + add maxq, slinesizeq > + add outq, olinesizeq > + sub hd, 1 > + jg .nextrow > +REP_RET Just use RET. > + > +%endif > diff --git a/libavfilter/x86/vf_threshold_init.c > b/libavfilter/x86/vf_threshold_init.c > new file mode 100644 > index 0000000000..e2bbae11d5 > --- /dev/null > +++ b/libavfilter/x86/vf_threshold_init.c > @@ -0,0 +1,41 @@ > +/* > + * Copyright (c) 2015 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "libavutil/attributes.h" > +#include "libavutil/cpu.h" > +#include "libavutil/x86/cpu.h" > +#include "libavfilter/threshold.h" > + > +void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold, > + const uint8_t *min, const uint8_t *max, > + uint8_t *out, > + ptrdiff_t ilinesize, ptrdiff_t tlinesize, > + ptrdiff_t flinesize, ptrdiff_t slinesize, > + ptrdiff_t olinesize, > + int w, int h); > + > +av_cold void ff_threshold_init_x86(ThresholdContext *s) > +{ > + int cpu_flags = av_get_cpu_flags(); > + > + if (ARCH_X86_64 && EXTERNAL_SSE4(cpu_flags) && s->depth == 8) { > + s->threshold = ff_threshold8_sse4; > + } > +} > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel