On 10/11/2015 3:11 PM, Ronald S. Bultje wrote: > Hi, > > On Sun, Oct 11, 2015 at 1:17 PM, James Almer <jamr...@gmail.com> wrote: > >> On 10/11/2015 4:31 AM, Paul B Mahol wrote: >>> On 10/11/15, James Almer <jamr...@gmail.com> wrote: >>>> Signed-off-by: James Almer <jamr...@gmail.com> >>>> --- >>>> libavfilter/x86/vf_w3fdif.asm | 16 +++++++--------- >>>> 1 file changed, 7 insertions(+), 9 deletions(-) >>>> >>>> diff --git a/libavfilter/x86/vf_w3fdif.asm >> b/libavfilter/x86/vf_w3fdif.asm >>>> index f02319b..f2001a4 100644 >>>> --- a/libavfilter/x86/vf_w3fdif.asm >>>> +++ b/libavfilter/x86/vf_w3fdif.asm >>>> @@ -103,13 +103,11 @@ REP_RET >>>> >>>> %if ARCH_X86_64 >>>> >>>> -cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, in_lines_cur0, >>>> in_lines_adj0, coef, linesize >>>> +cglobal w3fdif_simple_high, 5, 9, 8, 0, work_line, in_lines_cur0, >>>> in_lines_adj0, coef, linesize >>>> movq m2, [coefq] >>>> DEFINE_ARGS work_line, in_lines_cur0, in_lines_adj0, >> in_lines_cur1, >>>> linesize, offset, in_lines_cur2, in_lines_adj1, in_lines_adj2 >>>> - SPLATW m0, m2, 0 >>>> - SPLATW m1, m2, 1 >>>> + pshufd m0, m2, q0000 >>>> SPLATW m2, m2, 2 >>>> - SBUTTERFLY wd, 0, 1, 7 >>>> pxor m7, m7 >>>> mov offsetq, 0 >>>> mov in_lines_cur2q, [in_lines_cur0q+gprsize*2] >>>> @@ -124,23 +122,23 @@ cglobal w3fdif_simple_high, 5, 9, 9, 0, work_line, >>>> in_lines_cur0, in_lines_adj0, >>>> movh m4, [in_lines_cur1q+offsetq] >>>> punpcklbw m3, m7 >>>> punpcklbw m4, m7 >>>> - SBUTTERFLY wd, 3, 4, 8 >>>> + SBUTTERFLY wd, 3, 4, 1 >>>> pmaddwd m3, m0 >>>> - pmaddwd m4, m1 >>>> + pmaddwd m4, m0 >>>> movh m5, [in_lines_adj0q+offsetq] >>>> movh m6, [in_lines_adj1q+offsetq] >>>> punpcklbw m5, m7 >>>> punpcklbw m6, m7 >>>> - SBUTTERFLY wd, 5, 6, 8 >>>> + SBUTTERFLY wd, 5, 6, 1 >>>> pmaddwd m5, m0 >>>> - pmaddwd m6, m1 >>>> + pmaddwd m6, m0 >>>> paddd m3, m5 >>>> paddd m4, m6 >>>> movh m5, [in_lines_cur2q+offsetq] >>>> movh m6, [in_lines_adj2q+offsetq] >>>> punpcklbw m5, m7 >>>> punpcklbw m6, m7 >>>> 
- SBUTTERFLY wd, 5, 6, 8 >>>> + SBUTTERFLY wd, 5, 6, 1 >>>> pmaddwd m5, m2 >>>> pmaddwd m6, m2 >>>> paddd m3, m5 >>>> -- >>>> 2.6.0 >>>> >>>> _______________________________________________ >>>> ffmpeg-devel mailing list >>>> ffmpeg-devel@ffmpeg.org >>>> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >>>> >>> >>> Can't this now be used on x32? >> > > Add to the data pointers directly (in_lines_cur0q and work_lineq). Then sub > all other curXq/adjXq from cur0q (on 32bit only) before the loop and you > have two adds (on 32bit) instead of one (on 64bit), but one reg less > (offset), making it 7, which means it works. > > Ronald
Ah, like it's being done in PACK_6CH from swr's audio_convert.asm. For complex_high, some stack ab/use will be needed (see PACK_8CH), but it should be doable. This way w3fdif will be able to fully dethrone yadif :P _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel