Performance (lower is better):
8-bit:  ff_hflip_byte_ssse3 0.61, ff_hflip_byte_avx2 0.37, ff_hflip_byte_avx512 0.19
16-bit: ff_hflip_short_ssse3 1.27, ff_hflip_short_avx2 0.76, ff_hflip_short_avx512 0.40
Signed-off-by: Wu Jianhua <jianhua...@intel.com> --- libavfilter/x86/vf_hflip.asm | 23 ++++++++++++++++++----- libavfilter/x86/vf_hflip_init.c | 8 ++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm index 285618954f..c2237217f7 100644 --- a/libavfilter/x86/vf_hflip.asm +++ b/libavfilter/x86/vf_hflip.asm @@ -26,12 +26,16 @@ SECTION_RODATA pb_flip_byte: db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 pb_flip_short: db 14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1 +pd_flip_indicies: dd 12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3 SECTION .text ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short) %macro HFLIP 3 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x +%if mmsize == 64 + movu m3, [pd_flip_indicies] +%endif VBROADCASTI128 m0, [pb_flip_%1] xor xq, xq %if %3 == 1 @@ -47,12 +51,15 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x .loop0: neg xq -%if mmsize == 32 - vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load - vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load +%if mmsize == 64 + vpermd m1, m3, [srcq + xq - mmsize + %3] + vpermd m2, m3, [srcq + xq - 2 * mmsize + %3] +%elif mmsize == 32 + vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load + vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load %else - movu m1, [srcq + xq - mmsize + %3] - movu m2, [srcq + xq - 2 * mmsize + %3] + movu m1, [srcq + xq - mmsize + %3] + movu m2, [srcq + xq - 2 * mmsize + %3] %endif pshufb m1, m0 pshufb m2, m0 @@ -88,3 +95,9 @@ INIT_YMM avx2 HFLIP byte, b, 1 HFLIP short, w, 2 %endif + +%if HAVE_AVX512_EXTERNAL +INIT_ZMM avx512 +HFLIP byte, b, 1 +HFLIP short, w, 2 +%endif diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c index 0ac399b0d4..25fc40f7b0 100644 --- a/libavfilter/x86/vf_hflip_init.c +++ b/libavfilter/x86/vf_hflip_init.c @@ -25,8 +25,10 @@ void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w); 
void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w); +void ff_hflip_byte_avx512(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w); void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w); +void ff_hflip_short_avx512(const uint8_t *src, uint8_t *dst, int w); av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) { @@ -41,6 +43,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->flip_line[i] = ff_hflip_byte_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->flip_line[i] = ff_hflip_byte_avx512; + } } else if (step[i] == 2) { if (EXTERNAL_SSSE3(cpu_flags)) { s->flip_line[i] = ff_hflip_short_ssse3; @@ -48,6 +53,9 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes) if (EXTERNAL_AVX2_FAST(cpu_flags)) { s->flip_line[i] = ff_hflip_short_avx2; } + if (EXTERNAL_AVX512(cpu_flags)) { + s->flip_line[i] = ff_hflip_short_avx512; + } } } } -- 2.17.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".