On 10/1/2015 2:25 PM, Paul B Mahol wrote: > Signed-off-by: Paul B Mahol <one...@gmail.com> > --- > libavfilter/maskedmerge.h | 39 +++++++++++++++++++++ > libavfilter/vf_maskedmerge.c | 33 ++++++------------ > libavfilter/x86/Makefile | 2 ++ > libavfilter/x86/vf_maskedmerge.asm | 66 > +++++++++++++++++++++++++++++++++++ > libavfilter/x86/vf_maskedmerge_init.c | 39 +++++++++++++++++++++ > 5 files changed, 156 insertions(+), 23 deletions(-) > create mode 100644 libavfilter/maskedmerge.h > create mode 100644 libavfilter/x86/vf_maskedmerge.asm > create mode 100644 libavfilter/x86/vf_maskedmerge_init.c > > diff --git a/libavfilter/maskedmerge.h b/libavfilter/maskedmerge.h > new file mode 100644 > index 0000000..b198e65 > --- /dev/null > +++ b/libavfilter/maskedmerge.h > @@ -0,0 +1,39 @@ > +/* > + * Copyright (c) 2015 Paul B Mahol > + * > + * This file is part of FFmpeg. > + * > + * FFmpeg is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2.1 of the License, or (at your option) any later version. > + * > + * FFmpeg is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. 
> + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with FFmpeg; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > + */ > + > +#include "avfilter.h" > +#include "framesync.h" > + > +typedef struct MaskedMergeContext { > + const AVClass *class; > + int width[4], height[4]; > + int nb_planes; > + int planes; > + int half, depth; > + FFFrameSync fs; > + > + void (*maskedmerge)(const uint8_t *bsrc, int blinesize, > + const uint8_t *osrc, int olinesize, > + const uint8_t *msrc, int mlinesize, > + uint8_t *dst, int dlinesize, int w, int h, > + int half, int shift);
Make the pointers the first four arguments, followed by the linesize ones, then the rest. It will allow you to get this working on x86_32 with some changes. Also, linesize arguments should be ptrdiff_t. [...] > diff --git a/libavfilter/x86/vf_maskedmerge.asm > b/libavfilter/x86/vf_maskedmerge.asm > new file mode 100644 > index 0000000..462674a > --- /dev/null > +++ b/libavfilter/x86/vf_maskedmerge.asm > @@ -0,0 +1,66 @@ > +;***************************************************************************** > +;* x86-optimized functions for maskedmerge filter > +;* > +;* Copyright (C) 2015 Paul B Mahol > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 > USA > +;***************************************************************************** > + > +%include "libavutil/x86/x86util.asm" If this is x86_64 only for now, then you need to put everything under this line inside an "%if ARCH_X86_64" preprocessor check. > + > +SECTION_RODATA > + > +pw_128: times 8 dw 128 > +pw_256: times 8 dw 256 > + > +SECTION .text > + > +INIT_XMM sse2 > +cglobal maskedmerge8, 10, 11, 3, 0, bsrc, blinesize, osrc, olinesize, msrc, > mlinesize, dst, dlinesize, w, h You can remove the 0 if you're not reserving stack space. It's an optional parameter. 
Also, you're using more than 3 xmm regs (m7 is the highest one referenced below), so the xmm register count given to cglobal needs to be raised to match. > + mova m7, [pw_128] > + pxor m6, m6 > +.nextrow: > + mov r10q, 0 > + %define x r10q > + > + .loop: > + movh m0, [bsrcq + x] > + movh m1, [osrcq + x] > + movh m3, [msrcq + x] > + mova m4, [pw_256] You're not using m2, so you can load pw_256 into it once outside the loop, like you did for pw_128, and copy it into m4 with a register move on each iteration (psubw overwrites m4). That is much faster than constantly loading it from memory. For that matter, m5 is also unused. > + punpcklbw m0, m6 > + punpcklbw m1, m6 > + punpcklbw m3, m6 > + psubw m4, m3 > + pmullw m4, m0 > + pmullw m1, m3 > + paddw m1, m4 > + paddw m1, m7 > + psrlw m1, 8 > + packuswb m1, m1 > + movh [dstq + x], m1 > + add r10q, mmsize / 2 > + cmp r10q, wq > + jl .loop > + > + lea bsrcq, [bsrcq+blinesizeq] > + lea osrcq, [osrcq+olinesizeq] > + lea msrcq, [msrcq+mlinesizeq] > + lea dstq, [dstq+dlinesizeq] These are simple sums, so just use add instead of lea. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel