From: Xu Jun <xuju...@sjtu.edu.cn> Tested using the following command: ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5\ 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45\ :1/45:1:2:3:4:row:row:row:row" -an -vframes 1000 -f null /dev/null
The fps increases from 297 to 780 on my local machine. Signed-off-by: Xu Jun <xuju...@sjtu.edu.cn> --- libavfilter/x86/vf_convolution.asm | 104 ++++++++++++++++++++++++++ libavfilter/x86/vf_convolution_init.c | 10 +++ 2 files changed, 114 insertions(+) mode change 100644 => 100755 libavfilter/x86/vf_convolution.asm diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm old mode 100644 new mode 100755 index 754d4d1064..b71e9720fb --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -154,3 +154,107 @@ cglobal filter_3x3, 4, 15, 7, dst, width, rdiv, bias, matrix, ptr, c0, c1, c2, c INIT_XMM sse4 FILTER_3X3 %endif + +; void filter_row_sse4(uint8_t *dst, int width, +; float rdiv, float bias, const int *const matrix, +; const uint8_t *c[], int peak, int radius, +; int dstride, int stride) +%if ARCH_X86_64 +INIT_XMM sse4 +%if UNIX64 +cglobal filter_row, 6, 10, 7, dst, width, matrix, ptr, mult, rad, r, x, i, ci +%else +cglobal filter_row, 4, 10, 7, dst, width, rdiv, bias, matrix, ptr, mult, rad, r, x, i, ci +%endif + +%if WIN64 + SWAP m0, m2 + SWAP m1, m3 + mov r2q, matrixmp + mov r3q, ptrmp + mov r5q, radmp + DEFINE_ARGS dst, width, matrix, ptr, mult, rad, r, x, i, ci +%endif + +movsxdifnidn radq, radd +sal radq, 1 +add radq, 1 ; 2*radius+1 +movsxdifnidn widthq, widthd +VBROADCASTSS m0, m0 +VBROADCASTSS m1, m1 +pxor m6, m6 +movss m5, [half] +VBROADCASTSS m5, m5 + +xor xq, xq +cmp widthq, mmsize/4 +jl .loop2 + +mov rq, widthq +and rq, mmsize/4-1 +sub widthq, rq + +.loop1: + pxor m4, m4 + xor iq, iq + .loop1_1: + movss m2, [matrixq + 4*iq] + VBROADCASTSS m2, m2 + mov ciq, [ptrq + iq * gprsize] + movss m3, [ciq + xq] + punpcklbw m3, m6 + punpcklwd m3, m6 + pmulld m2, m3 + paddd m4, m2 + + add iq, 1 + cmp iq, radq + jl .loop1_1 + + cvtdq2ps m4, m4 + mulps m4, m0 ; sum *= rdiv + addps m4, m1 ; sum += bias + addps m4, m5 ; sum += 0.5 + cvttps2dq m4, m4 + packssdw m4, m4 + packuswb m4, m4 + movss [dstq + xq], m4 + 
add xq, mmsize/4 + cmp xq, widthq + jl .loop1 + + add widthq, rq + cmp xq, widthq + jge .end + +.loop2: + xor rd, rd + xor iq, iq + .loop2_2: + mov ciq, [ptrq + iq * gprsize] + movzx multd, byte [ciq + xq] + imul multd, [matrixq + 4*iq] + add rd, multd + + add iq, 1 + cmp iq, radq + jl .loop2_2 + + pxor m4, m4 + cvtsi2ss m4, rd + mulss m4, m0 ; sum *= rdiv + addss m4, m1 ; sum += bias + addss m4, m5 ; sum += 0.5 + cvttps2dq m4, m4 + packssdw m4, m4 + packuswb m4, m4 + movd rd, m4 + mov [dstq + xq], rb + + add xq, 1 + cmp xq, widthq + jl .loop2 +.end: + RET +%endif diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c index 51432406ed..d1e8c90ceb 100644 --- a/libavfilter/x86/vf_convolution_init.c +++ b/libavfilter/x86/vf_convolution_init.c @@ -29,6 +29,12 @@ void ff_filter_3x3_sse4(uint8_t *dst, int width, const uint8_t *c[], int peak, int radius, int dstride, int stride); +void ff_filter_row_sse4(uint8_t *dst, int width, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int peak, int radius, + int dstride, int stride); + + av_cold void ff_convolution_init_x86(ConvolutionContext *s) { #if ARCH_X86_64 @@ -41,6 +47,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) s->filter[i] = ff_filter_3x3_sse4; } } + if (s->mode[i] == MATRIX_ROW) { + if (EXTERNAL_SSE4(cpu_flags)) + s->filter[i] = ff_filter_row_sse4; + } } #endif } -- 2.17.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".