Zen 2 (Ryzen 7 3700X): the avx2 version is 1.73x faster than ssse3 (ssse3: 3603±586.3 vs. avx2: 2082±317.1 decicycles)
Using an SD y4m file speed increases from ~ 3600 fps to ~4700. --- libavfilter/x86/vf_yadif.asm | 83 +++++++++++++++++++++++---------- libavfilter/x86/vf_yadif_init.c | 4 ++ 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/libavfilter/x86/vf_yadif.asm b/libavfilter/x86/vf_yadif.asm index 809cebdd3f..571febfca3 100644 --- a/libavfilter/x86/vf_yadif.asm +++ b/libavfilter/x86/vf_yadif.asm @@ -25,11 +25,30 @@ SECTION_RODATA -pb_1: times 16 db 1 -pw_1: times 8 dw 1 +pb_1: times 32 db 1 +pw_1: times 16 dw 1 SECTION .text +%unmacro RSHIFT 2 + +%macro RSHIFT 2 +%if mmsize == 32 + vextracti128 xm7, %1, 1 + palignr xmm %+ %1, xm7, xmm %+ %1, 2 +%else + psrldq %1, %2 +%endif +%endmacro + +%macro UNPACK 1 +%if mmsize == 32 + pmovzxbw %1, xmm %+ %1 +%else + punpcklbw %1, m7 +%endif +%endmacro + %macro CHECK 2 movu m2, [curq+t1+%1] movu m3, [curq+t0+%2] @@ -40,7 +59,7 @@ SECTION .text pand m4, [pb_1] psubusb m5, m4 RSHIFT m5, 1 - punpcklbw m5, m7 + UNPACK m5 mova m4, m2 psubusb m2, m3 psubusb m3, m4 @@ -49,9 +68,9 @@ SECTION .text mova m4, m2 RSHIFT m3, 1 RSHIFT m4, 2 - punpcklbw m2, m7 - punpcklbw m3, m7 - punpcklbw m4, m7 + UNPACK m2 + UNPACK m3 + UNPACK m4 paddw m2, m3 paddw m2, m4 %endmacro @@ -81,13 +100,19 @@ SECTION .text %endmacro %macro LOAD 2 - movh %1, %2 - punpcklbw %1, m7 + %if mmsize == 32 + pmovzxbw %1, %2 + %else + movh %1, %2 + punpcklbw %1, m7 + %endif %endmacro %macro FILTER 3 .loop%1: - pxor m7, m7 + %if mmsize != 32 + pxor m7, m7 + %endif LOAD m0, [curq+t1] LOAD m1, [curq+t0] LOAD m2, [%2] @@ -95,9 +120,9 @@ SECTION .text mova m4, m3 paddw m3, m2 psraw m3, 1 - mova [rsp+ 0], m0 - mova [rsp+16], m3 - mova [rsp+32], m1 + mova [rsp+0*mmsize], m0 + mova [rsp+1*mmsize], m3 + mova [rsp+2*mmsize], m1 psubw m2, m4 ABS1 m2, m4 LOAD m3, [prevq+t1] @@ -119,7 +144,7 @@ SECTION .text paddw m3, m4 psrlw m3, 1 pmaxsw m2, m3 - mova [rsp+48], m2 + mova [rsp+3*mmsize], m2 paddw m1, m0 paddw m0, m0 @@ -134,9 +159,9 @@ SECTION .text psubusb m3, m4 pmaxub m2, m3 
mova m3, m2 - psrldq m3, 2 - punpcklbw m2, m7 - punpcklbw m3, m7 + RSHIFT m3, 2 + UNPACK m2 + UNPACK m3 paddw m0, m2 paddw m0, m3 psubw m0, [pw_1] @@ -150,7 +175,7 @@ SECTION .text CHECK 1, -3 CHECK2 - mova m6, [rsp+48] + mova m6, [rsp+3*mmsize] cmp DWORD r8m, 2 jge .end%1 LOAD m2, [%2+t1*2] @@ -161,9 +186,9 @@ SECTION .text paddw m3, m5 psrlw m2, 1 psrlw m3, 1 - mova m4, [rsp+ 0] - mova m5, [rsp+16] - mova m7, [rsp+32] + mova m4, [rsp+0*mmsize] + mova m5, [rsp+1*mmsize] + mova m7, [rsp+2*mmsize] psubw m2, m4 psubw m3, m7 mova m0, m5 @@ -182,15 +207,21 @@ SECTION .text pmaxsw m6, m4 .end%1: - mova m2, [rsp+16] + mova m2, [rsp+1*mmsize] mova m3, m2 psubw m2, m6 paddw m3, m6 pmaxsw m1, m2 pminsw m1, m3 - packuswb m1, m1 - movh [dstq], m1 + %if mmsize == 32 + vextracti128 xm4, ym1, 1 + packuswb xm1, xm4 + movu [dstq], xm1 + %else + packuswb m1, m1 + movh [dstq], m1 + %endif add dstq, mmsize/2 add prevq, mmsize/2 add curq, mmsize/2 @@ -201,10 +232,10 @@ SECTION .text %macro YADIF 0 %if ARCH_X86_32 -cglobal yadif_filter_line, 4, 6, 8, 80, dst, prev, cur, next, w, prefs, \ +cglobal yadif_filter_line, 4, 6, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \ mrefs, parity, mode %else -cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \ +cglobal yadif_filter_line, 4, 7, 8, 4*mmsize, dst, prev, cur, next, w, prefs, \ mrefs, parity, mode %endif %if ARCH_X86_32 @@ -233,3 +264,5 @@ INIT_XMM ssse3 YADIF INIT_XMM sse2 YADIF +INIT_YMM avx2 +YADIF diff --git a/libavfilter/x86/vf_yadif_init.c b/libavfilter/x86/vf_yadif_init.c index d648f0f835..48858dc295 100644 --- a/libavfilter/x86/vf_yadif_init.c +++ b/libavfilter/x86/vf_yadif_init.c @@ -29,6 +29,8 @@ void ff_yadif_filter_line_sse2(void *dst, void *prev, void *cur, void ff_yadif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next, int w, int prefs, int mrefs, int parity, int mode); +void ff_yadif_filter_line_avx2(void *dst, void *prev, void *cur, void *next, + int w, int prefs, int mrefs, int parity, 
int mode); void ff_yadif_filter_line_16bit_sse2(void *dst, void *prev, void *cur, void *next, int w, int prefs, @@ -68,5 +70,7 @@ av_cold void ff_yadif_init_x86(YADIFContext *yadif, int bit_depth) yadif->filter_line = ff_yadif_filter_line_sse2; if (EXTERNAL_SSSE3(cpu_flags)) yadif->filter_line = ff_yadif_filter_line_ssse3; + if (EXTERNAL_AVX2(cpu_flags)) + yadif->filter_line = ff_yadif_filter_line_avx2; } } -- 2.39.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".