On 10/5/2015 6:49 AM, Paul B Mahol wrote:
> diff --git a/libavfilter/x86/vf_stereo3d.asm b/libavfilter/x86/vf_stereo3d.asm
> new file mode 100644
> index 0000000..269004b
> --- /dev/null
> +++ b/libavfilter/x86/vf_stereo3d.asm
> @@ -0,0 +1,184 @@
> +;*****************************************************************************
> +;* x86-optimized functions for stereo3d filter
> +;*
> +;* Copyright (C) 2015 Paul B Mahol
> +;*
> +;* This file is part of FFmpeg.
> +;*
> +;* FFmpeg is free software; you can redistribute it and/or
> +;* modify it under the terms of the GNU Lesser General Public
> +;* License as published by the Free Software Foundation; either
> +;* version 2.1 of the License, or (at your option) any later version.
> +;*
> +;* FFmpeg is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> +;* Lesser General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU Lesser General Public
> +;* License along with FFmpeg; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> +;*****************************************************************************
> +
> +%include "libavutil/x86/x86util.asm"
> +
> +%if ARCH_X86_64
> +
> +SECTION_RODATA
> +
> +; rgbrgbrgbrgb
> +; rrrrggggbbbb
> +
> +shuf: db 0, 4, 8, 1,5, 9, 2, 6,10,3, 7,11,-1,-1,-1,-1
> +ex_r: db 0,-1,-1,-1,3,-1,-1,-1,6,-1,-1,-1, 9,-1,-1,-1
> +ex_g: db 1,-1,-1,-1,4,-1,-1,-1,7,-1,-1,-1,10,-1,-1,-1
> +ex_b: db 2,-1,-1,-1,5,-1,-1,-1,8,-1,-1,-1,11,-1,-1,-1
> +
> +SECTION .text
> +
> +INIT_XMM sse4
> +cglobal anaglyph, 11, 13, 16, 3*6*mmsize, dst, lsrc, rsrc, dst_linesize, l_linesize, r_linesize, width, height, ana_matrix_r, ana_matrix_g, ana_matrix_b
> +    movd m10, [ana_matrix_rq+ 0]
> +    movd m11, [ana_matrix_rq+ 4]
> +    movd m12, [ana_matrix_rq+ 8]
> +    movd m13, [ana_matrix_rq+12]
> +    movd m14, [ana_matrix_rq+16]
> +    movd m15, [ana_matrix_rq+20]
> +    pshufd m10, m10, q0000
> +    pshufd m11, m11, q0000
> +    pshufd m12, m12, q0000
> +    pshufd m13, m13, q0000
> +    pshufd m14, m14, q0000
> +    pshufd m15, m15, q0000
    mova   m13, [ana_matrix_rq + 0]
    movq   m15, [ana_matrix_rq + 16]
    pshufd m10, m13, q0000
    pshufd m11, m13, q1111
    pshufd m12, m13, q2222
    pshufd m13, m13, q3333
    pshufd m14, m15, q0000
    pshufd m15, m15, q1111

This will probably be faster.

Also, you're not using m7 anywhere, and m13, m14 and m15 remain unused after the init code. You could keep four of the coeffs in them instead of using the stack.

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel