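---
This function requires SSE4.1 because of the roundps instruction (packusdw, used for the final pack, is also SSE4.1-only). I would love to find a way to truncate a float to an integer using only SSE2. [Editorial note: SSE2's cvttps2dq converts packed floats to 32-bit integers with truncation, so it is a candidate replacement for the roundps + cvtps2dq pair.]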
---
 libavfilter/x86/vf_blend.asm    | 32 ++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_blend_init.c |  6 ++++++
 2 files changed, 38 insertions(+)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index a5ea74c..dac04d7 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -24,6 +24,7 @@
 
 SECTION_RODATA
 
+ps_255: times 4 dd 255.0
 pw_1: times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -285,3 +286,34 @@ INIT_XMM sse2
 BLEND_ABS
 INIT_XMM ssse3
 BLEND_ABS
+
+INIT_XMM sse4
+BLEND_INIT divide, 4
+    pxor       m2, m2                        ; m2 = 0, zero source for the unpacks below
+    mova       m3, [ps_255]                  ; m3 = 255.0 in all four float lanes
+.nextrow:
+    mov        xq, widthq                    ; xq = per-row start index (taken from widthq)
+
+    .loop:
+        movd            m0, [topq + xq]      ; 000000xx  load 4 bytes from top
+        movd            m1, [bottomq + xq]   ; 000000xx  load 4 bytes from bottom
+        punpcklbw       m0, m2               ; 00000x0x  bytes -> words, zero-extended via m2
+        punpcklbw       m1, m2
+        punpcklwd       m0, m2               ; 000x000x  words -> dwords, zero-extended via m2
+        punpcklwd       m1, m2
+
+        cvtdq2ps        m0, m0               ; int32 -> float (top, "a")
+        cvtdq2ps        m1, m1               ; int32 -> float (bottom, "b")
+        divps           m0, m1               ; a / b
+        mulps           m0, m3               ; a / b * 255
+        roundps         m0, m0, 3            ; truncate toward zero (imm8 = 3); SSE4.1
+        minps           m0, m3               ; clamp to 255.0; b == 0 gives +inf or NaN, which also end up as 255.0
+        cvtps2dq        m0, m0               ; float -> int32 (values are already integral here)
+
+        packusdw        m0, m0               ; 00000x0x  dwords -> words, unsigned saturation; SSE4.1
+        packuswb        m0, m0               ; 000000xx  words -> bytes, unsigned saturation
+        movd    [dstq + xq], m0              ; store the 4 result bytes
+        add             xq, mmsize / 4       ; next 4 bytes (mmsize / 4 = 4 with XMM registers)
+
+    jl .loop                                 ; loop while xq < 0 (flags from the add); assumes BLEND_INIT makes widthq a negative offset - not visible in this hunk
+BLEND_END
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index a6baf94..f542870 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -48,6 +48,7 @@ BLEND_FUNC(difference, sse2)
 BLEND_FUNC(difference, ssse3)
 BLEND_FUNC(negation, sse2)
 BLEND_FUNC(negation, ssse3)
+BLEND_FUNC(divide, sse4)
 
 av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
 {
@@ -79,4 +80,9 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         case BLEND_NEGATION: param->blend = ff_blend_negation_ssse3; break;
         }
     }
+    if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1 && !is_16bit) { /* SSE4 path: opacity == 1 and non-16-bit input only */
+        switch (param->mode) {
+        case BLEND_DIVIDE: param->blend = ff_blend_divide_sse4; break;
+        }
+    }
 }
-- 
2.1.4

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel