On 8/6/17, Henrik Gramner <hen...@gramner.com> wrote: > On Sat, Aug 5, 2017 at 9:10 PM, Ivan Kalvachev <ikalvac...@gmail.com> wrote: >> +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm >> +%if cpuflag(avx2) >> + vbroadcastss %1, %2 ; ymm, xmm >> +%elif cpuflag(avx) >> + %ifnum sizeof%2 ; avx1 register >> + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, >> imm > > Nit: Use shufps instead of vpermilps, it's one byte shorter but > otherwise identical in this case. > > c5 e8 c6 ca 00 vshufps xmm1,xmm2,xmm2,0x0 > c4 e3 79 04 ca 00 vpermilps xmm1,xmm2,0x0
It's also 1 latency cycle less on some old AMD CPUs. Done. >> +%macro BLENDVPS 3 ; dst/src_a, src_b, mask >> +%if cpuflag(avx) >> + blendvps %1, %1, %2, %3 >> +%elif cpuflag(sse4) >> + %if notcpuflag(avx) >> + %ifnidn %3,xmm0 >> + %error sse41 blendvps uses xmm0 as default 3d operand, you >> used %3 >> + %endif >> + %endif > > notcpuflag(avx) is redundant (it's always true since AVX uses the first > branch). Done. This is a remnant from the time I had labels to turn on and off different implementations. Best Regards _______________________________________________ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > http://ffmpeg.org/mailman/listinfo/ffmpeg-devel >
From a43da9061c08dcf4cb6ecd7c8eaad074cdb551d1 Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev <ikalvac...@gmail.com> Date: Sat, 5 Aug 2017 20:18:50 +0300 Subject: [PATCH 1/6] Add macros to x86util.asm . Improved version of VBROADCASTSS that works like the avx2 instruction. Emulation of vpbroadcastd. Horizontal sum HSUMPS that places the result in all elements. Emulation of blendvps and pblendvb. Signed-off-by: Ivan Kalvachev <ikalvac...@gmail.com> --- libavutil/x86/x86util.asm | 106 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 98 insertions(+), 8 deletions(-) diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm index cc7d272cad..e1220dfc1a 100644 --- a/libavutil/x86/x86util.asm +++ b/libavutil/x86/x86util.asm @@ -832,14 +832,25 @@ pmaxsd %1, %2 %endmacro -%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32 -%if cpuflag(avx) - vbroadcastss %1, %2 -%else ; sse -%ifnidn %1, %2 - movss %1, %2 -%endif - shufps %1, %1, 0 +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vbroadcastss %1, %2 +%elif cpuflag(avx) + %ifnum sizeof%2 ; avx1 register + shufps xmm%1, xmm%2, xmm%2, q0000 + %if sizeof%1 >= 32 ; mmsize>=32 + vinsertf128 %1, %1, xmm%1, 1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 + %endif +%else + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory + movss %1, %2 + shufps %1, %1, 0 + %endif %endif %endmacro @@ -854,6 +865,21 @@ %endif %endmacro +%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm +%if cpuflag(avx2) + vpbroadcastd %1, %2 +%elif cpuflag(avx) && sizeof%1 >= 32 + %error vpbroadcastd not possible with ymm on avx1. 
try vbroadcastss +%else + %ifnum sizeof%2 ; sse2 register + pshufd %1, %2, q0000 + %else ; sse memory + movd %1, %2 + pshufd %1, %1, 0 + %endif +%endif +%endmacro + %macro SHUFFLE_MASK_W 8 %rep 8 %if %1>=0x80 @@ -918,3 +944,67 @@ movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst %endif %endmacro + +; Horizontal Sum of Packed Single precision floats +; The resulting sum is in all elements. +%macro HSUMPS 2 ; dst/src, tmp +%if cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) + addps %1, %2 + %endif + shufps %2, %1, %1, q1032 + addps %1, %2 + shufps %2, %1, %1, q0321 + addps %1, %2 +%else ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 + shufps %1, %1, q1032 + addps %1, %2 + movaps %2, %1 + shufps %1, %1, q0321 + addps %1, %2 + ; all %1 members should be equal for as long as float a+b==b+a +%endif +%endmacro + +; Emulate blendvps if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro BLENDVPS 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + blendvps %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + blendvps %1, %2, %3 +%else + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%endif +%endmacro + +; Emulate pblendvb if not available +; +; src_b is destroyed when using emulation with logical operands +; SSE41 blendv instruction is hard coded to use xmm0 as mask +%macro PBLENDVB 3 ; dst/src_a, src_b, mask +%if cpuflag(avx) + %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32 + %error pblendb not possible with ymm on avx1, try blendvps. + %endif + pblendvb %1, %1, %2, %3 +%elif cpuflag(sse4) + %ifnidn %3,xmm0 + %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3 + %endif + pblendvb %1, %2, %3 +%else + pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%endif +%endmacro -- 2.14.0
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel