The AVX2 code leverages VPERMD to process 12 pixels/iteration. This is my first patch submission so any comments are greatly appreciated.
-Mike Tested on Skylake (Win32 & Win64) 1920x1080 input frame ===================== C code - 440 fps SSSE3 - 920 fps AVX - 930 fps AVX2 - 1040 fps Regression tested at 1920x1080, 1280x720, and 352x288 --- libavcodec/v210dec.c | 10 ++++- libavcodec/x86/v210-init.c | 8 ++++ libavcodec/x86/v210.asm | 83 +++++++++++++++++++++++++++++--------- 3 files changed, 81 insertions(+), 20 deletions(-) diff --git a/libavcodec/v210dec.c b/libavcodec/v210dec.c index ddc5dbe8be..166cec7a4a 100644 --- a/libavcodec/v210dec.c +++ b/libavcodec/v210dec.c @@ -119,7 +119,7 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, const uint32_t *src = (const uint32_t*)psrc; uint32_t val; - w = (avctx->width / 6) * 6; + w = (avctx->width / 12) * 12; s->unpack_frame(src, y, u, v, w); y += w; @@ -127,6 +127,14 @@ static int decode_frame(AVCodecContext *avctx, void *data, int *got_frame, v += w >> 1; src += (w << 1) / 3; + if (w < avctx->width - 5) { + READ_PIXELS(u, y, v); + READ_PIXELS(y, u, y); + READ_PIXELS(v, y, u); + READ_PIXELS(y, v, y); + w += 6; + } + if (w < avctx->width - 1) { READ_PIXELS(u, y, v); diff --git a/libavcodec/x86/v210-init.c b/libavcodec/x86/v210-init.c index d64dbca1a8..cb9a6cbd6a 100644 --- a/libavcodec/x86/v210-init.c +++ b/libavcodec/x86/v210-init.c @@ -21,9 +21,11 @@ extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_unaligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); +extern void ff_v210_planar_unpack_aligned_avx2(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); av_cold void ff_v210_x86_init(V210DecContext *s) { @@ -36,6 +38,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_aligned_avx; + + if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) + s->unpack_frame = ff_v210_planar_unpack_aligned_avx2; } else { if (cpu_flags & AV_CPU_FLAG_SSSE3) @@ -43,6 +48,9 @@ av_cold void ff_v210_x86_init(V210DecContext *s) if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; + + if (HAVE_AVX2_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX2) + s->unpack_frame = ff_v210_planar_unpack_unaligned_avx2; } #endif } diff --git a/libavcodec/x86/v210.asm b/libavcodec/x86/v210.asm index c24c765e5b..09449c93aa 100644 --- a/libavcodec/x86/v210.asm +++ b/libavcodec/x86/v210.asm @@ -22,52 +22,87 @@ %include "libavutil/x86/x86util.asm" -SECTION_RODATA +SECTION_RODATA 32 + +v210_mult: times 8 dw 64,4 +v210_mask: times 8 dd 0x3ff +v210_luma_shuf: times 2 db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 +v210_chroma_shuf1: times 2 db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 + +; for AVX2 version only +v210_luma_permute: dd 0,1,2,4,5,6,7,7 +v210_chroma_permute: dd 0,1,4,5,2,3,6,7 +v210_chroma_shuf2: times 2 db 0,1,2,3,4,5,8,9,10,11,12,13,-1,-1,-1,-1 -v210_mask: times 4 dd 0x3ff -v210_mult: dw 64,4,64,4,64,4,64,4 -v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 -v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 SECTION .text %macro v210_planar_unpack 1 ; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) -cglobal v210_planar_unpack_%1, 5, 5, 7 +cglobal v210_planar_unpack_%1, 5, 5, 10 movsxdifnidn r4, r4d lea r1, [r1+2*r4] add r2, r4 add r3, r4 neg r4 - mova m3, [v210_mult] - mova m4, [v210_mask] - mova m5, [v210_luma_shuf] - mova m6, [v210_chroma_shuf] + mova m3, [v210_luma_shuf] + mova m4, [v210_chroma_shuf1] + +%if cpuflag(avx2) + mova m5, [v210_luma_permute] ; VPERMD constant must be in a register + mova m6, [v210_chroma_permute] ; VPERMD constant must be in a register + mova m7, [v210_chroma_shuf2] +%endif + +%if ARCH_X86_64 + mova m8, [v210_mult] + mova m9, [v210_mask] +%endif + .loop: %ifidn %1, unaligned - movu m0, [r0] + movu m0, [r0] ; yB v5 yA u5 y9 v4 y8 u4 y7 v3 y6 u3 y5 v2 y4 u2 y3 v1 y2 u1 y1 v0 y0 u0 %else mova m0, [r0] %endif - pmullw m1, m0, m3 +%if ARCH_X86_64 + pmullw m1, m0, m8 + psrld m0, 10 ; __ v5 __ y9 __ u4 __ y6 __ v2 __ y3 __ u1 __ y0 + pand m0, m9 ; 00 v5 00 y9 00 u4 00 y6 00 v2 00 y3 00 u1 00 y0 +%else + pmullw m1, m0, [v210_mult] psrld m0, 10 - psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 - pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ + pand m0, [v210_mask] +%endif + + psrlw m1, 6 ; yB yA u5 v4 y8 y7 v3 u3 y5 y4 u2 v1 y2 y1 v0 u0 + shufps m2, m1, m0, 0x8d ; 00 y9 00 y6 yB yA y8 y7 00 y3 00 y0 y5 y4 y2 y1 + pshufb m2, m3 ; 00 00 yB yA y9 y8 y7 y6 00 00 y5 y4 y3 y2 y1 y0 + +%if cpuflag(avx2) + vpermd m2, m5, m2 ; 00 00 00 00 yB yA y9 y8 y7 y6 y5 y4 y3 y2 y1 y0 +%endif - shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ - pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ movu [r1+2*r4], m2 - shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ - pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ + shufps m1, m0, 0xd8 ; 00 v5 00 u4 u5 v4 v3 u3 00 v2 00 u1 u2 v1 v0 u0 + pshufb m1, m4 ; 00 v5 v4 v3 00 u5 u4 u3 00 v2 v1 v0 00 u2 u1 u0 + +%if cpuflag(avx2) + vpermd m1, m6, m1 ; 00 v5 v4 v3 00 v2 v1 v0 00 u5 u4 u3 00 u2 u1 u0 + pshufb m1, m7 ; 00 00 v5 v4 v3 v2 v1 v0 00 00 u5 u4 u3 u2 u1 u0 + movu [r2+r4], xm1 + vextracti128 [r3+r4], m1, 1 +%else movq [r2+r4], m1 movhps [r3+r4], m1 +%endif add r0, mmsize - add r4, 6 + add r4, (mmsize*3)/8 jl .loop REP_RET @@ -81,6 +116,11 @@ INIT_XMM avx v210_planar_unpack unaligned %endif +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +v210_planar_unpack unaligned +%endif + INIT_XMM ssse3 v210_planar_unpack aligned @@ -88,3 +128,8 @@ v210_planar_unpack aligned INIT_XMM avx v210_planar_unpack aligned %endif + +%if HAVE_AVX2_EXTERNAL +INIT_YMM avx2 +v210_planar_unpack aligned +%endif -- 2.20.1.windows.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel