On 8/6/17, Henrik Gramner <hen...@gramner.com> wrote:
> On Sat, Aug 5, 2017 at 9:10 PM, Ivan Kalvachev <ikalvac...@gmail.com> wrote:
>> +%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
>> +%if cpuflag(avx2)
>> +    vbroadcastss  %1, %2                    ; ymm, xmm
>> +%elif cpuflag(avx)
>> +    %ifnum sizeof%2         ; avx1 register
>> +        vpermilps  xmm%1, xmm%2, q0000      ; xmm, xmm, imm || ymm, ymm,
>> imm
>
> Nit: Use shufps instead of vpermilps, it's one byte shorter but
> otherwise identical in this case.
>
> c5 e8 c6 ca 00    vshufps xmm1,xmm2,xmm2,0x0
> c4 e3 79 04 ca 00 vpermilps xmm1,xmm2,0x0

It's also 1 latency cycle less on some old AMD cpu's.

Done.


>> +%macro BLENDVPS 3 ; dst/src_a, src_b, mask
>> +%if cpuflag(avx)
>> +    blendvps  %1, %1, %2, %3
>> +%elif cpuflag(sse4)
>> +    %if notcpuflag(avx)
>> +        %ifnidn %3,xmm0
>> +            %error sse41 blendvps uses xmm0 as default 3d operand, you
>> used %3
>> +        %endif
>> +    %endif
>
> notcpuflag(avx) is redundant (it's always true since AVX uses the first
> branch).

Done.

This is a remnant from the time I had label to turn on and off
different implementations.


Best Regards

 _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
From a43da9061c08dcf4cb6ecd7c8eaad074cdb551d1 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvac...@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH 1/6] Add macros to x86util.asm .

Improved version of VBROADCASTSS that works like the avx2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Signed-off-by: Ivan Kalvachev <ikalvac...@gmail.com>
---
 libavutil/x86/x86util.asm | 106 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 98 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index cc7d272cad..e1220dfc1a 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -832,14 +832,25 @@
     pmaxsd  %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
-    movss        %1, %2
-%endif
-    shufps       %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vbroadcastss  %1, %2
+%elif cpuflag(avx)
+    %ifnum sizeof%2         ; avx1 register
+        shufps  xmm%1, xmm%2, xmm%2, q0000
+        %if sizeof%1 >= 32  ; mmsize>=32
+            vinsertf128  %1, %1, xmm%1, 1
+        %endif
+    %else                   ; avx1 memory
+        vbroadcastss  %1, %2
+    %endif
+%else
+    %ifnum sizeof%2         ; sse register
+        shufps  %1, %2, %2, q0000
+    %else                   ; sse memory
+        movss   %1, %2
+        shufps  %1, %1, 0
+    %endif
 %endif
 %endmacro
 
@@ -854,6 +865,21 @@
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vpbroadcastd  %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd not possible with ymm on avx1. try vbroadcastss
+%else
+    %ifnum sizeof%2         ; sse2 register
+        pshufd  %1, %2, q0000
+    %else                   ; sse memory
+        movd    %1, %2
+        pshufd  %1, %1, 0
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
     %rep 8
         %if %1>=0x80
@@ -918,3 +944,67 @@
     movhlps    %1, %2        ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
+
+; Horizontal Sum of Packed Single precision floats
+; The resulting sum is in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+    %if sizeof%1>=32  ; avx
+        vperm2f128  %2, %1, %1, (0)*16+(1)
+        addps       %1, %2
+    %endif
+    shufps      %2, %1, %1, q1032
+    addps       %1, %2
+    shufps      %2, %1, %1, q0321
+    addps       %1, %2
+%else  ; this form is a bit faster than the short avx-like emulation.
+    movaps      %2, %1
+    shufps      %1, %1, q1032
+    addps       %1, %2
+    movaps      %2, %1
+    shufps      %1, %1, q0321
+    addps       %1, %2
+    ; all %1 members should be equal for as long as float a+b==b+a
+%endif
+%endmacro
+
+; Emulate blendvps if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 blendvps uses xmm0 as default 3d operand, you used %3
+    %endif
+    blendvps  %1, %2, %3
+%else
+    xorps  %2, %1
+    andps  %2, %3
+    xorps  %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if not available
+;
+; src_b is destroyed when using emulation with logical operands
+; SSE41 blendv instruction is hard coded to use xmm0 as mask
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+        %error pblendb not possible with ymm on avx1, try blendvps.
+    %endif
+    pblendvb  %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3,xmm0
+        %error sse41 pblendvd uses xmm0 as default 3d operand, you used %3
+    %endif
+    pblendvb  %1, %2, %3
+%else
+    pxor  %2, %1
+    pand  %2, %3
+    pxor  %1, %2
+%endif
+%endmacro
-- 
2.14.0

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Reply via email to