On Thu, Jan 7, 2016 at 4:18 AM, James Almer <jamr...@gmail.com> wrote: > On 1/6/2016 11:54 PM, Hendrik Leppkes wrote: >> --- >> Based on an idea from Ronald mentioend in an earlier thread about this >> function. >> >> It works and passes FATE, however I'm sure some aspects can be done easier >> or cleaner, so please let me know. >> >> >> libavfilter/x86/vf_w3fdif.asm | 37 ++++++++++++++++++++++++++++++++++--- >> libavfilter/x86/vf_w3fdif_init.c | 2 +- >> 2 files changed, 35 insertions(+), 4 deletions(-) >> >> diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm >> index c3c73ea..35768c3 100644 >> --- a/libavfilter/x86/vf_w3fdif.asm >> +++ b/libavfilter/x86/vf_w3fdif.asm >> @@ -102,14 +102,22 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, >> in_lines_cur0, coef, linesize >> REP_RET >> >> %if ARCH_X86_64 >> - >> cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, >> in_lines_adj0, coef, linesize >> +%else >> +cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, >> in_lines_adj0, coef, linesize >> +%endif >> movq m2, [coefq] >> - DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, >> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2 >> +%if ARCH_X86_64 >> + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, linesize, >> offset, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2 > > This broke x86_64. Leave it as it was above.
The change wasn't intentional, I mangled that when moving things around apparently. > >> + mov offsetq, 0 > > Since you're moving this take the chance to replace it with a xor. Done > >> +%else >> + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, >> in_lines_cur2, in_lines_adj1, in_lines_adj2 >> + %define linesized dword r4m > > Nit: r4mp instead of dword r4m I stole dword r4m from audio_convert.asm, but apparently r4mp aliases to dword r4m, so changed anyway. > >> +%endif >> + >> pshufd m0, m2, q0000 >> SPLATW m2, m2, 2 >> pxor m7, m7 >> - mov offsetq, 0 >> mov in_lines_cur2q, [in_lines_cur0q+gprsize*2] >> mov in_lines_cur1q, [in_lines_cur0q+gprsize] >> mov in_lines_cur0q, [in_lines_cur0q] >> @@ -117,8 +125,21 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, >> in_lines_cur0, in_lines_adj0, >> mov in_lines_adj1q, [in_lines_adj0q+gprsize] >> mov in_lines_adj0q, [in_lines_adj0q] >> >> +%if ARCH_X86_32 >> + sub in_lines_cur1q, in_lines_cur0q >> + sub in_lines_cur2q, in_lines_cur0q >> + sub in_lines_adj0q, in_lines_cur0q >> + sub in_lines_adj1q, in_lines_cur0q >> + sub in_lines_adj2q, in_lines_cur0q >> + %define offsetq in_lines_cur0q >> +%endif >> + >> .loop: >> +%if ARCH_X86_64 >> movh m3, [in_lines_cur0q+offsetq] >> +%else >> + movh m3, [in_lines_cur0q] >> +%endif >> movh m4, [in_lines_cur1q+offsetq] >> punpcklbw m3, m7 >> punpcklbw m4, m7 >> @@ -143,15 +164,25 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, >> in_lines_cur0, in_lines_adj0, >> pmaddwd m6, m2 >> paddd m3, m5 >> paddd m4, m6 >> +%if ARCH_X86_64 >> paddd m3, [work_lineq+offsetq*4] >> paddd m4, [work_lineq+offsetq*4+mmsize] >> mova [work_lineq+offsetq*4], m3 >> mova [work_lineq+offsetq*4+mmsize], m4 >> +%else >> + paddd m3, [work_lineq] >> + paddd m4, [work_lineq+mmsize] >> + mova [work_lineq], m3 >> + mova [work_lineq+mmsize], m4 >> + add work_lineq, mmsize*2 >> +%endif >> add offsetq, mmsize/2 >> sub linesized, mmsize/2 >> jg .loop >> REP_RET >> >> +%if ARCH_X86_64 >> + >> cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, >> in_lines_adj0, coef, linesize >> movq m0, [coefq+0] >> movd m4, [coefq+8] >> diff --git a/libavfilter/x86/vf_w3fdif_init.c >> b/libavfilter/x86/vf_w3fdif_init.c >> index 72ea657..9bf06e8 100644 >> --- a/libavfilter/x86/vf_w3fdif_init.c >> +++ b/libavfilter/x86/vf_w3fdif_init.c >> @@ -51,12 +51,12 @@ av_cold void ff_w3fdif_init_x86(W3FDIFDSPContext *dsp) >> >> if (EXTERNAL_SSE2(cpu_flags)) { >> dsp->filter_simple_low = ff_w3fdif_simple_low_sse2; >> + dsp->filter_simple_high = ff_w3fdif_simple_high_sse2; >> dsp->filter_complex_low = ff_w3fdif_complex_low_sse2; >> dsp->filter_scale = ff_w3fdif_scale_sse2; >> } >> >> if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) { >> - dsp->filter_simple_high = ff_w3fdif_simple_high_sse2; >> dsp->filter_complex_high = ff_w3fdif_complex_high_sse2; >> } >> } >> > > Seems to work. Maybe it can be improved but it should be good as is. > > And to answer your question, no, i wasn't working on this. I got > distracted with other filters :P Will re-test both x86 and x64 in a bit and push in a few hours if no further comments show up. Thanks. - Hendrik _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel