On 10/8/15, James Almer <jamr...@gmail.com> wrote: > On 10/8/2015 2:02 PM, Paul B Mahol wrote: >> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm >> new file mode 100644 >> index 0000000..96b61d7 >> --- /dev/null >> +++ b/libavfilter/x86/vf_w3fdif.asm >> @@ -0,0 +1,284 @@ >> +;***************************************************************************** >> +;* x86-optimized functions for w3fdif filter >> +;* >> +;* Copyright (c) 2015 Paul B Mahol >> +;* >> +;* This file is part of FFmpeg. >> +;* >> +;* FFmpeg is free software; you can redistribute it and/or >> +;* modify it under the terms of the GNU Lesser General Public >> +;* License as published by the Free Software Foundation; either >> +;* version 2.1 of the License, or (at your option) any later version. >> +;* >> +;* FFmpeg is distributed in the hope that it will be useful, >> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of >> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> +;* Lesser General Public License for more details. >> +;* >> +;* You should have received a copy of the GNU Lesser General Public >> +;* License along with FFmpeg; if not, write to the Free Software >> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA >> 02110-1301 USA >> +;****************************************************************************** >> + >> +%include "libavutil/x86/x86util.asm" >> + >> +SECTION_RODATA >> + >> +pd_0: times 4 dd 0 > > Just use pxor to zero a register. > >> +pd_2_23: times 4 dd 256*256*128 >> + >> +SECTION .text >> + >> +INIT_XMM sse4 >> +cglobal w3fdif_scale, 3, 3, 3, 0, out_pixel, work_pixel, linesize >> + mova m1, [pd_0] >> + mova m2, [pd_2_23] >> + shr linesized, 2 >> + >> + .loop >> + mova m0, [work_pixelq] >> + pmaxsd m0, m1 >> + pminsd m0, m2 > > You can emulate these two using sse2 instructions. 
See CLIPD_SSE2 (using > float conversion) and CLIPD_MMX in x86util.asm > >> + psrld m0, 15 >> + packusdw m0, m0 >> + packuswb m0, m0 >> + movd [out_pixelq], m0 >> + add out_pixelq, mmsize/4 >> + add work_pixelq, mmsize >> + sub linesized, 1 >> + jg .loop >> +REP_RET >> + >> +INIT_XMM sse2 >> +cglobal w3fdif_simple_low, 4, 6, 5, 0, work_line, in_lines_cur0, coef, >> linesize >> + movd m0, [coefq+0] >> + movd m1, [coefq+2] > > movd m1, [coefq] > SPLATW m0, m1, 0 > SPLATW m1, m1, 1 > >> + SPLATW m0, m0 >> + SPLATW m1, m1 >> + shr linesized, 3 >> + mov r4q, 0 >> + mov r5q, [in_lines_cur0q + gprsize] >> + mov in_lines_cur0q, [in_lines_cur0q] >> + %define in_lines_cur1q r5q >> + >> + .loop >> + movh m2, [in_lines_cur0q+r4q] >> + movh m3, [in_lines_cur1q+r4q] >> + pxor m4, m4 >> + punpcklbw m2, m4 >> + punpcklbw m3, m4 >> + SBUTTERFLY wd, 2, 3, 4 >> + pmaddwd m2, m0 >> + pmaddwd m3, m1 >> + mova [work_lineq+r4q*4], m2 >> + mova [work_lineq+r4q*4+mmsize], m3 >> + add r4q, 8 >> + sub linesized, 1 >> + jg .loop >> +REP_RET >> + >> +cglobal w3fdif_simple_high, 5, 10, 8, 0, work_line, in_lines_cur0, >> in_lines_adj0, coef, linesize > > This is clearly not x86_32 friendly, so you will either have to get it > working > using 7 regs, or mark it as x86_64 only. > >> + movd m0, [coefq+0] >> + movd m1, [coefq+2] >> + movd m2, [coefq+4] > > movq m2, [coefq] > SPLATW m0, m2, 0 > SPLATW m1, m2, 1 > SPLATW m2, m2, 2 > > And so for every function. > >> + SPLATW m0, m0 >> + SPLATW m1, m1 >> + SPLATW m2, m2 >> + SBUTTERFLY wd, 0, 1, 7 >> + shr linesized, 3 > > Seems pointless if the only other instruction using this reg is a sub at the > end of the loop. > Can't you do the neg trick on linesize and use that as part of the effective > addresses inside the loop, instead of a zeroed r5q? 
> >> + mov r5q, 0 >> + mov r7q, [in_lines_cur0q+gprsize*2] >> + mov r6q, [in_lines_cur0q+gprsize] >> + mov in_lines_cur0q, [in_lines_cur0q] >> + %define in_lines_cur1q r6q >> + %define in_lines_cur2q r7q > > Instead of defining their names here, just name them in the cglobal line. > You can name registers there that aren't function arguments just fine. > > Both the above suggestions apply to other functions as well.
I prefer this approach; it's more readable. > _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel