--- Based on an idea from Ronald mentioned in an earlier thread about this function.
It works and passes FATE, however I'm sure some aspects can be done easier or cleaner, so please let me know. libavfilter/x86/vf_w3fdif.asm | 37 ++++++++++++++++++++++++++++++++++--- libavfilter/x86/vf_w3fdif_init.c | 2 +- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/libavfilter/x86/vf_w3fdif.asm b/libavfilter/x86/vf_w3fdif.asm index c3c73ea..35768c3 100644 --- a/libavfilter/x86/vf_w3fdif.asm +++ b/libavfilter/x86/vf_w3fdif.asm @@ -102,14 +102,22 @@ cglobal w3fdif_complex_low, 4, 7, 8, 0, work_line, in_lines_cur0, coef, linesize REP_RET %if ARCH_X86_64 - cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize +%else +cglobal w3fdif_simple_high, 4, 7, 8, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize +%endif movq m2, [coefq] - DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2 +%if ARCH_X86_64 + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, linesize, offset, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2 + mov offsetq, 0 +%else + DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, in_lines_cur1, in_lines_cur2, in_lines_adj1, in_lines_adj2 + %define linesized dword r4m +%endif + pshufd m0, m2, q0000 SPLATW m2, m2, 2 pxor m7, m7 - mov offsetq, 0 mov in_lines_cur2q, [in_lines_cur0q+gprsize*2] mov in_lines_cur1q, [in_lines_cur0q+gprsize] mov in_lines_cur0q, [in_lines_cur0q] @@ -117,8 +125,21 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, mov in_lines_adj1q, [in_lines_adj0q+gprsize] mov in_lines_adj0q, [in_lines_adj0q] +%if ARCH_X86_32 + sub in_lines_cur1q, in_lines_cur0q + sub in_lines_cur2q, in_lines_cur0q + sub in_lines_adj0q, in_lines_cur0q + sub in_lines_adj1q, in_lines_cur0q + sub in_lines_adj2q, in_lines_cur0q + %define offsetq in_lines_cur0q +%endif + .loop: +%if ARCH_X86_64 movh m3, [in_lines_cur0q+offsetq] +%else + movh m3, [in_lines_cur0q] +%endif movh m4, 
[in_lines_cur1q+offsetq] punpcklbw m3, m7 punpcklbw m4, m7 @@ -143,15 +164,25 @@ cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, in_lines_adj0, pmaddwd m6, m2 paddd m3, m5 paddd m4, m6 +%if ARCH_X86_64 paddd m3, [work_lineq+offsetq*4] paddd m4, [work_lineq+offsetq*4+mmsize] mova [work_lineq+offsetq*4], m3 mova [work_lineq+offsetq*4+mmsize], m4 +%else + paddd m3, [work_lineq] + paddd m4, [work_lineq+mmsize] + mova [work_lineq], m3 + mova [work_lineq+mmsize], m4 + add work_lineq, mmsize*2 +%endif add offsetq, mmsize/2 sub linesized, mmsize/2 jg .loop REP_RET +%if ARCH_X86_64 + cglobal w3fdif_complex_high, 5, 13, 10, 0, work_line, in_lines_cur0, in_lines_adj0, coef, linesize movq m0, [coefq+0] movd m4, [coefq+8] diff --git a/libavfilter/x86/vf_w3fdif_init.c b/libavfilter/x86/vf_w3fdif_init.c index 72ea657..9bf06e8 100644 --- a/libavfilter/x86/vf_w3fdif_init.c +++ b/libavfilter/x86/vf_w3fdif_init.c @@ -51,12 +51,12 @@ av_cold void ff_w3fdif_init_x86(W3FDIFDSPContext *dsp) if (EXTERNAL_SSE2(cpu_flags)) { dsp->filter_simple_low = ff_w3fdif_simple_low_sse2; + dsp->filter_simple_high = ff_w3fdif_simple_high_sse2; dsp->filter_complex_low = ff_w3fdif_complex_low_sse2; dsp->filter_scale = ff_w3fdif_scale_sse2; } if (ARCH_X86_64 && EXTERNAL_SSE2(cpu_flags)) { - dsp->filter_simple_high = ff_w3fdif_simple_high_sse2; dsp->filter_complex_high = ff_w3fdif_complex_high_sse2; } } -- 2.6.2.windows.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel