On 11/4/2021 1:18 AM, Wu Jianhua wrote:
Performance data(Less is better):
exposure_sse: 500491
You reported a better result in the first patch.
exposure_avx2: 449122
This looks like a really low speed up for a function that processes
twice the amount of floats per loop.
Signed-off-by: Wu Jianhua <jianhua...@intel.com>
---
libavfilter/x86/vf_exposure.asm | 15 +++++++++++++++
libavfilter/x86/vf_exposure_init.c | 6 ++++++
2 files changed, 21 insertions(+)
diff --git a/libavfilter/x86/vf_exposure.asm b/libavfilter/x86/vf_exposure.asm
index 3351c6fb3b..f271167805 100644
--- a/libavfilter/x86/vf_exposure.asm
+++ b/libavfilter/x86/vf_exposure.asm
@@ -36,11 +36,21 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
VBROADCASTSS m1, xmm1
%endif
+%if cpuflag(fma3) || cpuflag(fma4)
Remove the fma4 check if you're not using it.
+ mulps m0, m0, m1 ; black * scale
+%endif
+
.loop:
+%if cpuflag(fma3) || cpuflag(fma4)
+ mova m2, m0
+ vfmsub231ps m2, m1, [ptrq]
+ movu [ptrq], m2
Have you tried to not use FMA for this and just keep the sub + mul even
for AVX2 and see how it performs?
+%else
movu m2, [ptrq]
subps m2, m2, m0
mulps m2, m2, m1
movu [ptrq], m2
+%endif
add ptrq, mmsize
sub lengthq, mmsize/4
@@ -52,4 +62,9 @@ cglobal exposure, 2, 2, 4, ptr, length, black, scale
%if ARCH_X86_64
INIT_XMM sse
EXPOSURE
+
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+EXPOSURE
+%endif
%endif
diff --git a/libavfilter/x86/vf_exposure_init.c
b/libavfilter/x86/vf_exposure_init.c
index de1b360f6c..80dae6164e 100644
--- a/libavfilter/x86/vf_exposure_init.c
+++ b/libavfilter/x86/vf_exposure_init.c
@@ -24,6 +24,7 @@
#include "libavfilter/exposure.h"
void ff_exposure_sse(float *ptr, int length, float black, float scale);
+void ff_exposure_avx2(float *ptr, int length, float black, float scale);
av_cold void ff_exposure_init_x86(ExposureContext *s)
{
@@ -32,5 +33,10 @@ av_cold void ff_exposure_init_x86(ExposureContext *s)
#if ARCH_X86_64
if (EXTERNAL_SSE(cpu_flags))
s->exposure_func = ff_exposure_sse;
+
+#if HAVE_AVX2_EXTERNAL
No need for this preprocessor check.
+ if (EXTERNAL_AVX2_FAST(cpu_flags))
+ s->exposure_func = ff_exposure_avx2;
+#endif
#endif
}
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".