On 2/14/2021 11:32 AM, Paul B Mahol wrote:
Signed-off-by: Paul B Mahol <one...@gmail.com>
---
libavfilter/x86/vf_gblur.asm | 49 +++++++++++++++++++++++++++++++++
libavfilter/x86/vf_gblur_init.c | 17 ++++++++++--
2 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/libavfilter/x86/vf_gblur.asm b/libavfilter/x86/vf_gblur.asm
index a25b1659f5..8ccfbdc56b 100644
--- a/libavfilter/x86/vf_gblur.asm
+++ b/libavfilter/x86/vf_gblur.asm
@@ -183,3 +183,52 @@ HORIZ_SLICE
INIT_XMM avx2
HORIZ_SLICE
%endif
+
+%macro POSTSCALE_SLICE 0
+%if UNIX64
+cglobal postscale_slice, 2, 3, 4, ptr, length, x
Use "2, 2, 4, ptr, length" instead; the extra x register is not needed once lengthq itself is used as the loop index.
+%else
+cglobal postscale_slice, 5, 6, 4, ptr, length, postscale, min, max, x
Use "5, 5, 4, ptr, length, postscale, min, max" instead (drop the x register).
+%endif
+ shl lengthd, 2
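Follow the shl with an add and a neg, so that ptrq points at the end of the buffer and lengthq is a negative byte offset that counts up to zero:
shl lengthd, 2
add ptrq, lengthq
neg lengthq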
+%if WIN64
+ SWAP 0, 2
+ SWAP 1, 3
+ SWAP 2, 4
+%endif
+%if cpuflag(avx2)
+ vbroadcastss m0, xm0
+ vbroadcastss m1, xm1
+ vbroadcastss m2, xm2
+%else
+ shufps xm0, xm0, 0
+ shufps xm1, xm1, 0
+ shufps xm2, xm2, 0
+%endif
+ xor xq, xq
Remove this xor; no separate x counter is needed when lengthq is used as the index.
+
+ .loop:
+%if cpuflag(avx2)
+ mulps m3, m0, [ptrq + xq]
Replace xq with lengthq here and everywhere else.
+%else
+ movu m3, [ptrq + xq]
+ mulps m3, m0
+%endif
+ maxps m3, m1
+ minps m3, m2
+ movu [ptrq+xq], m3
+
+ add xq, mmsize
+ cmp xd, lengthd
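Remove this cmp; with the index counting up from a negative offset, the add above already sets the flags that jl tests.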
+ jl .loop
+
+ RET
+%endmacro
+
+INIT_XMM sse
+POSTSCALE_SLICE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+POSTSCALE_SLICE
+%endif
diff --git a/libavfilter/x86/vf_gblur_init.c b/libavfilter/x86/vf_gblur_init.c
index e63e59fe23..9223cb797d 100644
--- a/libavfilter/x86/vf_gblur_init.c
+++ b/libavfilter/x86/vf_gblur_init.c
@@ -27,14 +27,25 @@
void ff_horiz_slice_sse4(float *ptr, int width, int height, int steps, float nu, float bscale);
void ff_horiz_slice_avx2(float *ptr, int width, int height, int steps, float nu, float bscale);
+void ff_postscale_slice_sse(float *ptr, int length, float postscale, float min, float max);
+void ff_postscale_slice_avx2(float *ptr, int length, float postscale, float min, float max);
+
av_cold void ff_gblur_init_x86(GBlurContext *s)
{
-#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
- if (EXTERNAL_SSE4(cpu_flags))
+ if (EXTERNAL_SSE(cpu_flags)) {
+ s->postscale_slice = ff_postscale_slice_sse;
+ }
+ if (EXTERNAL_AVX2(cpu_flags)) {
Use EXTERNAL_AVX2_FAST here instead of EXTERNAL_AVX2.
+ s->postscale_slice = ff_postscale_slice_avx2;
+ }
+#if ARCH_X86_64
+ if (EXTERNAL_SSE4(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_sse4;
- if (EXTERNAL_AVX2(cpu_flags))
+ }
+ if (EXTERNAL_AVX2(cpu_flags)) {
s->horiz_slice = ff_horiz_slice_avx2;
+ }
#endif
}
LGTM with the above.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".