Performance (less is better):
8-bit:
    ff_threshold8_sse4      32.7555351
    ff_threshold8_avx2      32.1713562
    ff_threshold8_avx512    32.0103531
16-bit:
    ff_threshold16_sse4     37.7713432
    ff_threshold16_avx2     35.3348312
    ff_threshold16_avx512   32.6976166

Signed-off-by: Wu Jianhua <jianhua...@intel.com> --- libavfilter/x86/vf_threshold.asm | 44 +++++++++++++++++++++-------- libavfilter/x86/vf_threshold_init.c | 8 ++++++ 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm index 098069b083..dc4126c7af 100644 --- a/libavfilter/x86/vf_threshold.asm +++ b/libavfilter/x86/vf_threshold.asm @@ -29,6 +29,15 @@ pb_128_0 : times 8 db 0, 128 SECTION .text +%macro DECL_MASK 2 +%if mmsize < 64 + %xdefine %1 m%2 +%else + %assign %%i %2 + 1 + %xdefine %1 k %+ %%i +%endif +%endmacro + ;%1 depth (8 or 16) ; %2 b or w ; %3 constant %macro THRESHOLD 3 %if ARCH_X86_64 @@ -58,17 +67,24 @@ cglobal threshold%1, 5, 7, 5, in, threshold, min, max, out, w, x .nextrow: mov xq, wq - .loop: - movu m1, [inq + xq] - movu m0, [thresholdq + xq] - movu m2, [minq + xq] - movu m3, [maxq + xq] - pxor m0, m4 - pxor m1, m4 - pcmpgt%2 m0, m1 - PBLENDVB m3, m2, m0 - movu [outq + xq], m3 - add xq, mmsize +.loop: + movu m1, [inq + xq] + movu m0, [thresholdq + xq] + movu m2, [minq + xq] + movu m3, [maxq + xq] + pxor m0, m4 + pxor m1, m4 + DECL_MASK mask, 0 + pcmpgt%2 mask, m0, m1 + +%if mmsize == 64 + vpblendm%2 m3{mask}, m3, m2 +%else + PBLENDVB m3, m2, mask +%endif + + movu [outq + xq], m3 + add xq, mmsize jl .loop add inq, ilinesizeq @@ -90,3 +106,9 @@ INIT_YMM avx2 THRESHOLD 8, b, pb_128 THRESHOLD 16, w, pb_128_0 %endif + +%if HAVE_AVX512_EXTERNAL +INIT_ZMM avx512 +THRESHOLD 8, b, pb_128 +THRESHOLD 16, w, pb_128_0 +%endif diff --git a/libavfilter/x86/vf_threshold_init.c b/libavfilter/x86/vf_threshold_init.c index 8e42296791..0c75ea2870 100644 --- a/libavfilter/x86/vf_threshold_init.c +++ b/libavfilter/x86/vf_threshold_init.c @@ -34,8 +34,10 @@ void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\ THRESHOLD_FUNC(8, sse4) THRESHOLD_FUNC(8, avx2) +THRESHOLD_FUNC(8, avx512) THRESHOLD_FUNC(16, sse4) THRESHOLD_FUNC(16, avx2) +THRESHOLD_FUNC(16, avx512) av_cold 
void ff_threshold_init_x86(ThresholdContext *s) { @@ -48,6 +50,9 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->threshold = ff_threshold8_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->threshold = ff_threshold8_avx512; + } } else if (s->depth == 16) { if (EXTERNAL_SSE4(cpu_flags)) { s->threshold = ff_threshold16_sse4; @@ -55,5 +60,8 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->threshold = ff_threshold16_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->threshold = ff_threshold16_avx512; + } } } -- 2.17.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".