ffmpeg | branch: master | Martin Vignali <martin.vign...@gmail.com> | Tue Dec 19 21:06:01 2017 +0100| [f181648176c0d93851d4a89410bbdd9c85e1fa7c] | committer: Martin Vignali
avfilter/x86/vf_hflip : add avx2 version for hflip_byte and hflip_short

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f181648176c0d93851d4a89410bbdd9c85e1fa7c
---

 libavfilter/x86/vf_hflip.asm | 12 +++++++++++-
 libavfilter/x86/vf_hflip_init.c | 20 ++++++++++++++++----
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 82e1154d21..6bd1782da4 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -32,7 +32,7 @@ SECTION .text
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
-    mova m0, [pb_flip_%1]
+    VBROADCASTI128 m0, [pb_flip_%1]
     xor xq, xq
 %if %3 == 1
     movsxdifnidn wq, wd
@@ -47,8 +47,13 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg xq
+%if mmsize == 32
+        vpermq m1, [srcq + xq - mmsize + %3], 0x4e; flip each lane at load
+        vpermq m2, [srcq + xq - 2 * mmsize + %3], 0x4e; flip each lane at load
+%else
         movu m1, [srcq + xq - mmsize + %3]
         movu m2, [srcq + xq - 2 * mmsize + %3]
+%endif
         pshufb m1, m0
         pshufb m2, m0
         neg xq
@@ -78,3 +83,8 @@ INIT_XMM ssse3
 HFLIP byte, b, 1
 HFLIP short, w, 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 2b5c9d3bf3..0ac399b0d4 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -24,7 +24,9 @@
 #include "libavfilter/hflip.h"
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -32,10 +34,20 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
     int i;
 
     for (i = 0; i < nb_planes; i++) {
-        if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) {
-            s->flip_line[i] = ff_hflip_byte_ssse3;
-        } else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) {
-            s->flip_line[i] = ff_hflip_short_ssse3;
+        if (step[i] == 1) {
+            if (EXTERNAL_SSSE3(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_ssse3;
+            }
+            if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx2;
+            }
+        } else if (step[i] == 2) {
+            if (EXTERNAL_SSSE3(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_ssse3;
+            }
+            if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx2;
+            }
         }
     }
 }

_______________________________________________
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog