From: Xu Jun <xuju...@sjtu.edu.cn>

Performance improves about 10% compared to v1.
Tested using this command: ./ffmpeg_g -s 1280*720 -pix_fmt yuv420p -i test.yuv -vf convolution="1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1 2 3 4 5 6 7 8 9:1/45:1/45:1/45:1/45:1:2:3:4:column:column:column:column" -an -vframes 5000 -f null /dev/null -benchmark after patch: frame= 4317 fps=600 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed= 24x video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown bench: utime=21.540s stime=2.091s rtime=7.197s before patch: frame= 4317 fps=263 q=-0.0 Lsize=N/A time=00:02:52.68 bitrate=N/A speed=10.5x video:2260kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: unknown bench: utime=74.377s stime=1.880s rtime=16.420s Signed-off-by: Xu Jun <xuju...@sjtu.edu.cn> --- libavfilter/x86/vf_convolution.asm | 202 ++++++++++++++++++++++++++ libavfilter/x86/vf_convolution_init.c | 9 ++ 2 files changed, 211 insertions(+) diff --git a/libavfilter/x86/vf_convolution.asm b/libavfilter/x86/vf_convolution.asm index 2a09374b00..4c700656d6 100755 --- a/libavfilter/x86/vf_convolution.asm +++ b/libavfilter/x86/vf_convolution.asm @@ -22,6 +22,8 @@ SECTION_RODATA half: dd 0.5 +shuf_init: ddq 0x80808003808080028080800180808000 +shuf_step: ddq 0x00000004000000040000000400000004 SECTION .text @@ -285,3 +287,203 @@ sub widthq, rq .end: RET %endif + +; void filter_column(uint8_t *dst, int height, +; float rdiv, float bias, const int *const matrix, +; const uint8_t *c[], int length, int radius, +; int dstride, int stride); + +%macro COMPUTE_4COL 1 + pshufb m7, m6, m4 ; get 4 uint8s from the 16 uint8s + pmulld m7, m5 + paddd m1%1, m7 +%endmacro + +%macro CVT_PACK_COL 1 + cvtdq2ps m1%1, m1%1 + mulps m1%1, m0 ; sum *= rdiv + addps m1%1, m1 ; sum += bias + addps m1%1, m3 ; sum += 0.5 + cvttps2dq m1%1, m1%1 + packssdw m1%1, m1%1 + packuswb m1%1, m1%1 +%endmacro + +%if ARCH_X86_64 +INIT_XMM sse4 +%if UNIX64 +cglobal filter_column, 8, 14, 14, dst, height, matrix, ptr, width, rad, 
dstride, stride, \ +i, ci, ystride, sum, r, off16 +%else +cglobal filter_column, 8, 14, 14, dst, height, rdiv, bias, matrix, ptr, width, rad, dstride, stride, \ +i, ci, ystride, sum, r, off16 +%endif + +%if WIN64 + SWAP m0, m2 + SWAP m1, m3 + mov r2q, matrixmp + mov r3q, ptrmp + mov r4q, widthmp + mov r5q, radmp + mov r6q, dstridemp + mov r7q, stridemp + DEFINE_ARGS dst, height, matrix, ptr, width, rad, dstride, stride, \ + i, ci, ystride, sum, r, off16 +%endif + +movsxdifnidn widthq, widthd +movsxdifnidn radq, radd +lea radq, [radq * 2 + 1] +movsxdifnidn dstrideq, dstrided +movsxdifnidn strideq, strided +movsxdifnidn heightq, heightd + +VBROADCASTSS m0, m0 ; rdiv +VBROADCASTSS m1, m1 ; bias +pxor m2, m2 ; zero +movss m3, [half] +VBROADCASTSS m3, m3 ; 0.5 +movdqu m8, [shuf_init] ; shuffle initialization +movdqu m9, [shuf_step] ; shuffle step + +xor ystrideq, ystrideq ; y*stride + +cmp widthq, mmsize ;if width<16 run loopr, width=16 run 16 parallel +jl .less16 + +.equal16: + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + pxor m13, m13 + ; m10-13 hold sums + + lea iq, [radq - 1] + .loopi: + movd m5, [matrixq + 4*iq] ; matrix[i] + VBROADCASTSS m5, m5 + mov ciq, [ptrq + iq * gprsize] + movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s + + ;m4 controls shuffle + movdqa m4, m8 + COMPUTE_4COL 0 ; process 0-3 cols, sum in m10 + paddd m4, m9 + COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 + paddd m4, m9 + COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 + paddd m4, m9 + COMPUTE_4COL 3 ; process 12-15 cols, sum in m13 + + sub iq, 1 + jns .loopi + + CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit + CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit + CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit + CVT_PACK_COL 3 ; process 12-15 cols, result in m13's low 32bit + punpckldq m10, m11 + punpckldq m12, m13 + punpcklqdq m10, m12 ; pack 16 results in m10 + movdqu [dstq], m10 + + add dstq, dstrideq + add ystrideq, strideq + sub heightq, 1 + 
jnz .equal16 + jmp .end + +.less16: + xor off16q, off16q + cmp widthq, mmsize/4 + jl .loopr + + mov rq, widthq + and rq, mmsize/4-1 + sub widthq, rq + + pxor m10, m10 + pxor m11, m11 + pxor m12, m12 + + lea iq, [radq - 1] + .loopi_4: + movd m5, [matrixq + 4*iq] ; matrix[i] + VBROADCASTSS m5, m5 + mov ciq, [ptrq + iq * gprsize] + movdqu m6, [ciq + ystrideq] ; c[i][y*stride] 16 uint8s + + ;m4 controls shuffle + movdqa m4, m8 + COMPUTE_4COL 0 ; process 0-3 cols, sum in m10 + cmp widthq, mmsize/4 ; width = 4 + je .i4_end + + paddd m4, m9 + COMPUTE_4COL 1 ; process 4-7 cols, sum in m11 + cmp widthq, mmsize/2 ; width = 8 + je .i4_end + + paddd m4, m9 + COMPUTE_4COL 2 ; process 8-11 cols, sum in m12 + + .i4_end: + sub iq, 1 + jns .loopi_4 + + CVT_PACK_COL 0 ; process 0-3 cols, result in m10's low 32bit + movd [dstq], m10 + cmp widthq, mmsize/4 ; width = 4 + je .cvt_end + + CVT_PACK_COL 1 ; process 4-7 cols, result in m11's low 32bit + movd [dstq + mmsize/4], m11 + cmp widthq, mmsize/2 ; width = 8 + je .cvt_end + + CVT_PACK_COL 2 ; process 8-11 cols, result in m12's low 32bit + movd [dstq + mmsize/2], m12 + + .cvt_end: + cmp rq, 0 + je .loopr_end + mov off16q, widthq + add widthq, rq + + .loopr: + xor sumq, sumq + lea iq, [radq - 1] + .loopr_i: + mov ciq, [ptrq + iq * gprsize] + add ciq, ystrideq + movzx rd, byte [ciq + off16q] + imul rd, [matrixq + 4*iq] + add sumd, rd + + sub iq, 1 + jns .loopr_i + + pxor m7, m7 + cvtsi2ss m7, sumd + mulss m7, m0 ; sum *= rdiv + addss m7, m1 ; sum += bias + addss m7, m3 ; sum += 0.5 + cvttps2dq m7, m7 + packssdw m7, m7 + packuswb m7, m7 + movd sumd, m7 + mov [dstq + off16q], sumb + add off16q, 1 + cmp off16q, widthq + jl .loopr + + .loopr_end: + add dstq, dstrideq + add ystrideq, strideq + sub heightq, 1 + jnz .less16 + +.end: + RET +%endif diff --git a/libavfilter/x86/vf_convolution_init.c b/libavfilter/x86/vf_convolution_init.c index 5eb3b3bee1..da39b8a400 100644 --- a/libavfilter/x86/vf_convolution_init.c +++ 
b/libavfilter/x86/vf_convolution_init.c @@ -34,6 +34,11 @@ void ff_filter_row_sse4(uint8_t *dst, int width, const uint8_t *c[], int peak, int radius, int dstride, int stride); +void ff_filter_column_sse4(uint8_t *dst, int height, + float rdiv, float bias, const int *const matrix, + const uint8_t *c[], int length, int radius, + int dstride, int stride); + av_cold void ff_convolution_init_x86(ConvolutionContext *s) { #if ARCH_X86_64 @@ -50,6 +55,10 @@ av_cold void ff_convolution_init_x86(ConvolutionContext *s) if (EXTERNAL_SSE4(cpu_flags)) s->filter[i] = ff_filter_row_sse4; } + if (s->mode[i] == MATRIX_COLUMN) { + if (EXTERNAL_SSE4(cpu_flags)) + s->filter[i] = ff_filter_column_sse4; + } } #endif } -- 2.17.1 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".