From cf4dc8fcd974a845b91aaa8685c06fa145b01786 Mon Sep 17 00:00:00 2001
From: Ivan Kalvachev <ikalvac...@gmail.com>
Date: Sat, 5 Aug 2017 20:18:50 +0300
Subject: [PATCH 1/6] Add macros to x86util.asm.
Improved version of VBROADCASTSS that works like the AVX2 instruction.
Emulation of vpbroadcastd.
Horizontal sum HSUMPS that places the result in all elements.
Emulation of blendvps and pblendvb.

Signed-off-by: Ivan Kalvachev <ikalvac...@gmail.com>
---
 libavutil/x86/x86util.asm | 108 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 100 insertions(+), 8 deletions(-)

diff --git a/libavutil/x86/x86util.asm b/libavutil/x86/x86util.asm
index cc7d272cad..d460ee5193 100644
--- a/libavutil/x86/x86util.asm
+++ b/libavutil/x86/x86util.asm
@@ -832,14 +832,25 @@
     pmaxsd      %1, %2
 %endmacro
 
-%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32
-%if cpuflag(avx)
-    vbroadcastss %1, %2
-%else ; sse
-%ifnidn %1, %2
-    movss       %1, %2
-%endif
-    shufps      %1, %1, 0
+%macro VBROADCASTSS 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vbroadcastss %1, %2                 ; ymm, xmm
+%elif cpuflag(avx)
+    %ifnum sizeof%2                     ; avx1 register
+        vpermilps   xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm
+        %if sizeof%1 >= 32              ; mmsize >= 32
+            vinsertf128 %1, %1, xmm%1, 1 ; ymm, ymm, xmm, imm
+        %endif
+    %else                               ; avx1 memory
+        vbroadcastss %1, %2             ; ymm, m32 || xmm, m32
+    %endif
+%else
+    %ifnum sizeof%2                     ; sse register
+        shufps      %1, %2, %2, q0000
+    %else                               ; sse memory
+        movss       %1, %2
+        shufps      %1, %1, 0
+    %endif
 %endif
 %endmacro
 
@@ -854,6 +865,21 @@
 %endif
 %endmacro
 
+%macro VPBROADCASTD 2 ; dst xmm/ymm, src m32/xmm
+%if cpuflag(avx2)
+    vpbroadcastd %1, %2
+%elif cpuflag(avx) && sizeof%1 >= 32
+    %error vpbroadcastd is not possible with ymm on avx1. Try vbroadcastss.
+%else
+    %ifnum sizeof%2                     ; sse2 register
+        pshufd      %1, %2, q0000
+    %else                               ; sse memory
+        movd        %1, %2
+        pshufd      %1, %1, 0
+    %endif
+%endif
+%endmacro
+
 %macro SHUFFLE_MASK_W 8
 %rep 8
 %if %1>=0x80
@@ -918,3 +944,69 @@
     movhlps %1, %2 ; may cause an int/float domain transition and has a dependency on dst
 %endif
 %endmacro
+
+; Horizontal sum of packed single-precision floats.
+; The resulting sum is placed in all elements.
+%macro HSUMPS 2 ; dst/src, tmp
+%if cpuflag(avx)
+    %if sizeof%1 >= 32                  ; avx
+        vperm2f128  %2, %1, %1, (0)*16+(1)
+        addps       %1, %2
+    %endif
+    shufps      %2, %1, %1, q1032
+    addps       %1, %2
+    shufps      %2, %1, %1, q0321
+    addps       %1, %2
+%else ; this form is a bit faster than the shorter avx-style sequence above
+    movaps      %2, %1
+    shufps      %1, %1, q1032
+    addps       %1, %2
+    movaps      %2, %1
+    shufps      %1, %1, q0321
+    addps       %1, %2
+    ; all %1 elements are now equal, as long as float a+b == b+a
+%endif
+%endmacro
+
+; Emulate blendvps if it is not available.
+;
+; src_b is destroyed when using the logical-operation emulation.
+; The SSE4.1 blendv instruction is hardcoded to use xmm0 as the mask.
+%macro BLENDVPS 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    blendvps    %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %if notcpuflag(avx)
+        %ifnidn %3, xmm0
+            %error sse4.1 blendvps uses xmm0 as the third operand, you used %3
+        %endif
+    %endif
+    blendvps    %1, %2, %3
+%else
+    xorps       %2, %1
+    andps       %2, %3
+    xorps       %1, %2
+%endif
+%endmacro
+
+; Emulate pblendvb if it is not available.
+;
+; src_b is destroyed when using the logical-operation emulation.
+; The SSE4.1 blendv instruction is hardcoded to use xmm0 as the mask.
+%macro PBLENDVB 3 ; dst/src_a, src_b, mask
+%if cpuflag(avx)
+    %if cpuflag(avx) && notcpuflag(avx2) && sizeof%1 >= 32
+        %error pblendvb is not possible with ymm on avx1. Try blendvps.
+    %endif
+    pblendvb    %1, %1, %2, %3
+%elif cpuflag(sse4)
+    %ifnidn %3, xmm0
+        %error sse4.1 pblendvb uses xmm0 as the third operand, you used %3
+    %endif
+    pblendvb    %1, %2, %3
+%else
+    pxor        %2, %1
+    pand        %2, %3
+    pxor        %1, %2
+%endif
+%endmacro
-- 
2.13.2
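
Usage note for reviewers (illustrative, not part of the patch): the updated
VBROADCASTSS accepts either a memory operand or a register source, so callers
no longer have to special-case the two. A minimal sketch with an invented
function name; VPBROADCASTD is the integer-domain counterpart and is called
the same way:

    %include "libavutil/x86/x86util.asm"

    SECTION .text

    INIT_XMM sse2
    ; void ff_splat4_sse2(float *dst, const float *src)
    ; Illustrative only: writes src[0] to dst[0..3].
    cglobal splat4, 2, 2, 1, dst, src
        VBROADCASTSS m0, [srcq]   ; m0 = {src[0], src[0], src[0], src[0]}
        movups [dstq], m0
        RET

With INIT_XMM avx2 the same VBROADCASTSS line assembles to the single
vbroadcastss instruction; only the SSE path expands to movss + shufps.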
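
HSUMPS leaving the sum in every element saves a separate broadcast when the
result feeds back into packed arithmetic (e.g. normalizing a vector by its
sum). Another hedged sketch with an invented name, assuming the same
x86util.asm prologue as above:

    INIT_XMM sse2
    ; float ff_sum4_sse2(const float *src)
    ; Illustrative only: returns src[0]+src[1]+src[2]+src[3].
    cglobal sum4, 1, 1, 2, src
        movups  m0, [srcq]
        HSUMPS  m0, m1      ; all 4 elements of m0 now hold the sum
        RET                 ; float return value is the low element of m0

All elements end up equal because float addition is commutative (the macro's
own "a+b == b+a" comment), but the shuffle/add tree sums in a different order
than a scalar left-to-right loop, so the value may differ from a C fallback
in the last bits.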
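
The fallback path of BLENDVPS/PBLENDVB is the classic bit-select identity

    dst = src_a ^ ((src_a ^ src_b) & mask)

which takes each bit from src_b where the mask bit is 1 and from src_a where
it is 0. One behavioral difference worth keeping in mind (my reading, not
stated in the patch): the real blendvps selects on the sign bit of each mask
element only, while the logical emulation uses every mask bit, so the two
agree only when mask elements are all-ones or all-zeros, as produced by
cmpps/pcmpeq*. An illustrative per-element maximum built on the macro, again
with an invented name:

    INIT_XMM sse4
    ; void ff_vmax4_sse4(float *dst, const float *a, const float *b)
    ; Illustrative only: dst[i] = (a[i] < b[i]) ? b[i] : a[i]
    cglobal vmax4, 3, 3, 3, dst, a, b
        movups   m1, [aq]
        movups   m2, [bq]
        movaps   m0, m1
        cmpltps  m0, m2        ; m0 = per-element all-ones where a < b
        BLENDVPS m1, m2, m0    ; m1 = mask ? b : a; mask must be m0 (xmm0) on sse4
        movups   [dstq], m1
        RET

Under INIT_XMM sse2 the same BLENDVPS line expands to the xorps/andps/xorps
sequence, which additionally clobbers m2, as the macro's comment warns.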