On 2/13/2021 8:10 AM, Paul B Mahol wrote:
Signed-off-by: Paul B Mahol <one...@gmail.com>
---
  libavfilter/x86/vf_gblur.asm    | 46 +++++++++++++++++++++++++++++++++
  libavfilter/x86/vf_gblur_init.c | 11 ++++++--
  2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..8fea6d2a61 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,49 @@ HORIZ_SLICE
  INIT_XMM avx2
  HORIZ_SLICE
  %endif
+
+%macro POSTSCALE_SLICE 0
+%if UNIX64
+cglobal postscale_slice, 2, 6, 4, ptr, length, postscale, min, max, x

cglobal postscale_slice, 2, 3, 4, ptr, length, x

+%else
+cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
+%endif
+    shl lengthd, 2
+%if WIN64
+    SWAP 0, 2
+    SWAP 1, 3
+    SWAP 2, 4
+%endif
+    shufps   xm0, xm0, 0
+    shufps   xm1, xm1, 0
+    shufps   xm2, xm2, 0
+%if cpuflag(avx2)
+    vinsertf128  m0, m0, xm0, 1
+    vinsertf128  m1, m1, xm1, 1
+    vinsertf128  m2, m2, xm2, 1

You can use vbroadcastss ymm, xmm with AVX2, which combines both the shufps and vinsertf128 into one instruction.

As is, this function is base AVX. So if you can't measure any performance gain with vbroadcastss, then just mark the function as AVX.

+%endif
+    xor      xq, xq
+
+    .loop:
+    movu          m3, [ptrq + xq]
+    mulps         m3, m0

AVX can use unaligned memory operands, so just do

mulps m3, m0, [ptrq + xq]

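But keep the explicit movu + mulps for the SSE version, otherwise x86inc will expand the three-operand form into a mova followed by a mulps with a memory operand, which requires an aligned address without AVX.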

+    maxps         m3, m1
+    minps         m3, m2
+    movu   [ptrq+xq], m3
+
+    add xq, mmsize
+    cmp xd, lengthd

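Can't you use the neg trick? It should let you reuse length instead of x. That is, advance ptr by length up front, negate length, and then index with [ptrq + lengthq] while adding mmsize to length until it reaches zero; the add sets the flags for the branch, so the separate cmp goes away as well.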

+    jl .loop
+
+    RET
+%endmacro
+
+%if ARCH_X86_64

Nothing in this function seems to require x86_64.

+INIT_XMM sse4

No instruction is SSE4 here. It's all base SSE.

+POSTSCALE_SLICE
+
+%if HAVE_AVX_EXTERNAL

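Wrong check. As long as the function is instantiated with INIT_YMM avx2, this needs to test HAVE_AVX2_EXTERNAL, not HAVE_AVX_EXTERNAL.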

+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..7a9b40b0ad 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,21 @@
  void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
  void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
+void ff_postscale_slice_sse4(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
  av_cold void ff_gblur_init_x86(GBlurContext *s)
  {
  #if ARCH_X86_64
      int cpu_flags = av_get_cpu_flags();
-    if (EXTERNAL_SSE4(cpu_flags))
+    if (EXTERNAL_SSE4(cpu_flags)) {
          s->horiz_slice = ff_horiz_slice_sse4;
-    if (EXTERNAL_AVX2(cpu_flags))
+        s->postscale_slice = ff_postscale_slice_sse4;
+    }
+    if (EXTERNAL_AVX2(cpu_flags)) {
          s->horiz_slice = ff_horiz_slice_avx2;
+        s->postscale_slice = ff_postscale_slice_avx2;

Needs to be EXTERNAL_AVX2_FAST. You're using ymm regs, unlike in ff_horiz_slice_avx2.

+    }
  #endif
  }


_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to