From cf4dc8fcd974a845b91aaa8685c06fa145b01786 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvac...@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH 1/6] Add macros to x86util.asm.
Improved version of VBROADCASTSS that works like the AVX2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Signed-off-by: Ivan Kalvachev <ikalvac...@gmail.com>
---
 libavutil/x86/x86util.asm | 108 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 100 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index cc7d272cad..d460ee5193 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -832,14 +832,25 @@
     pmaxsd      %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
-    movss       %1, %2
-%endif
-    shufps      %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vbroadcastss %1, %2                 ; ymm, xmm
+%elif cpuflag(avx)
+    %ifnum sizeof%2                     ; avx1 register
+        vpermilps   xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
+        %if sizeof%1 >= 32              ; mmsize >= 32
+            vinsertf128 %1, %1, xmm%1, 1 ; ymm, ymm, xmm, imm
+        %endif
+    %else                               ; avx1 memory
+        vbroadcastss %1, %2             ; ymm, m32 || xmm, m32
+    %endif
+%else
+    %ifnum sizeof%2                     ; sse register
+        shufps      %1, %2, %2, q0000
+    %else                               ; sse memory
+        movss       %1, %2
+        shufps      %1, %1, 0
+    %endif
 %endif
 %endmacro
 
@@ -854,6 +865,21 @@
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vpbroadcastd %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd is not possible with ymm on avx1. Try vbroadcastss.
+%else
+    %ifnum sizeof%2                     ; sse2 register
+        pshufd      %1, %2, q0000
+    %else                               ; sse memory
+        movd        %1, %2
+        pshufd      %1, %1, 0
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
 %rep 8
 %if %1>=0x80
@@ -918,3 +944,69 @@
     movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
+
+; Horizontal sum of packed single-precision floats.
+; The resulting sum is placed in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+    %if sizeof%1 >= 32                  ; avx
+        vperm2f128  %2, %1, %1, (0)*16+(1)
+        addps       %1, %2
+    %endif
+    shufps      %2, %1, %1, q1032
+    addps       %1, %2
+    shufps      %2, %1, %1, q0321
+    addps       %1, %2
+%else ; this form is a bit faster than the shorter avx-style sequence above
+    movaps      %2, %1
+    shufps      %1, %1, q1032
+    addps       %1, %2
+    movaps      %2, %1
+    shufps      %1, %1, q0321
+    addps       %1, %2
+    ; all %1 elements are now equal, as long as float a+b == b+a
+%endif
+%endmacro
+
+; Emulate blendvps if it is not available.
+;
+; src_b is destroyed when using the logical-operation emulation.
+; The SSE4.1 blendv instruction is hardcoded to use xmm0 as the mask.
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps    %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %if notcpuflag(avx)
+        %ifnidn %3, xmm0
+            %error sse4.1 blendvps uses xmm0 as the third operand, you used %3
+        %endif
+    %endif
+    blendvps    %1, %2, %3
+%else
+    xorps       %2, %1
+    andps       %2, %3
+    xorps       %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if it is not available.
+;
+; src_b is destroyed when using the logical-operation emulation.
+; The SSE4.1 blendv instruction is hardcoded to use xmm0 as the mask.
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+        %error pblendvb is not possible with ymm on avx1. Try blendvps.
+    %endif
+    pblendvb    %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3, xmm0
+        %error sse4.1 pblendvb uses xmm0 as the third operand, you used %3
+    %endif
+    pblendvb    %1, %2, %3
+%else
+    pxor        %2, %1
+    pand        %2, %3
+    pxor        %1, %2
+%endif
+%endmacro
-- 
2.13.2
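
Usage note for reviewers (illustrative, not part of the patch): the updated
VBROADCASTSS accepts either a memory operand or a register source, so callers
no longer have to special-case the two. A minimal sketch with an invented
function name; VPBROADCASTD is the integer-domain counterpart and is called
the same way:

    %include "libavutil/x86/x86util.asm"

    SECTION .text

    INIT_XMM sse2
    ; void ff_splat4_sse2(float *dst, const float *src)
    ; Illustrative only: writes src[0] to dst[0..3].
    cglobal splat4, 2, 2, 1, dst, src
        VBROADCASTSS m0, [srcq]   ; m0 = {src[0], src[0], src[0], src[0]}
        movups [dstq], m0
        RET

With INIT_XMM avx2 the same VBROADCASTSS line assembles to the single
vbroadcastss instruction; only the SSE path expands to movss + shufps.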
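
HSUMPS leaving the sum in every element saves a separate broadcast when the
result feeds back into packed arithmetic (e.g. normalizing a vector by its
sum). Another hedged sketch with an invented name, assuming the same
x86util.asm prologue as above:

    INIT_XMM sse2
    ; float ff_sum4_sse2(const float *src)
    ; Illustrative only: returns src[0]+src[1]+src[2]+src[3].
    cglobal sum4, 1, 1, 2, src
        movups  m0, [srcq]
        HSUMPS  m0, m1      ; all 4 elements of m0 now hold the sum
        RET                 ; float return value is the low element of m0

All elements end up equal because float addition is commutative (the macro's
own "a+b == b+a" comment), but the shuffle/add tree sums in a different order
than a scalar left-to-right loop, so the value may differ from a C fallback
in the last bits.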
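
The fallback path of BLENDVPS/PBLENDVB is the classic bit-select identity

    dst = src_a ^ ((src_a ^ src_b) & mask)

which takes each bit from src_b where the mask bit is 1 and from src_a where
it is 0. One behavioral difference worth keeping in mind (my reading, not
stated in the patch): the real blendvps selects on the sign bit of each mask
element only, while the logical emulation uses every mask bit, so the two
agree only when mask elements are all-ones or all-zeros, as produced by
cmpps/pcmpeq*. An illustrative per-element maximum built on the macro, again
with an invented name:

    INIT_XMM sse4
    ; void ff_vmax4_sse4(float *dst, const float *a, const float *b)
    ; Illustrative only: dst[i] = (a[i] < b[i]) ? b[i] : a[i]
    cglobal vmax4, 3, 3, 3, dst, a, b
        movups   m1, [aq]
        movups   m2, [bq]
        movaps   m0, m1
        cmpltps  m0, m2        ; m0 = per-element all-ones where a < b
        BLENDVPS m1, m2, m0    ; m1 = mask ? b : a; mask must be m0 (xmm0) on sse4
        movups   [dstq], m1
        RET

Under INIT_XMM sse2 the same BLENDVPS line expands to the xorps/andps/xorps
sequence, which additionally clobbers m2, as the macro's comment warns.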