This is an automated email from the git hooks/post-receive script. Git pushed a commit to branch master in repository ffmpeg.
commit f0178068299aa74fb119179d6f0df926dbeba57f Author: Andreas Rheinhardt <[email protected]> AuthorDate: Sun Nov 23 23:29:24 2025 +0100 Commit: Andreas Rheinhardt <[email protected]> CommitDate: Thu Dec 4 15:17:37 2025 +0100 avcodec/x86/vp8dsp: Don't use MMX registers in ff_put_vp8_epel4_v6_ssse3 Switching to xmm registers allows processing two rows in parallel, leading to speedups. It is also ABI compliant (no more missing emms). Old benchmarks: vp8_put_epel4_v6_c: 132.8 ( 1.00x) vp8_put_epel4_v6_ssse3: 34.3 ( 3.87x) New benchmarks: vp8_put_epel4_v6_c: 131.5 ( 1.00x) vp8_put_epel4_v6_ssse3: 27.1 ( 4.86x) Reviewed-by: Ronald S. Bultje <[email protected]> Signed-off-by: Andreas Rheinhardt <[email protected]> --- libavcodec/x86/vp8dsp.asm | 48 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm index 7cb729a443..4778944ac7 100644 --- a/libavcodec/x86/vp8dsp.asm +++ b/libavcodec/x86/vp8dsp.asm @@ -162,6 +162,12 @@ SECTION .text ;------------------------------------------------------------------------------- %macro FILTER_SSSE3 1 +%if %1 == 4 +%define MOV movd +%else +%define MOV movq +%endif + cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg lea mxd, [mxq*3] mova m3, [filter_h6_shuf2] @@ -269,6 +275,7 @@ cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picr jg .nextrow RET +INIT_XMM ssse3 cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my lea myd, [myq*3] %if PIC @@ -279,14 +286,44 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr ; read 5 lines mov picregq, srcstrideq neg picregq - movh m0, [srcq+2*picregq] - movh m1, [srcq+picregq] - movh m2, [srcq] - movh m3, [srcq+srcstrideq] - movh m4, [srcq+2*srcstrideq] + MOV m0, [srcq+2*picregq] + MOV m1, [srcq+picregq] + MOV m2, [srcq] + MOV m3, [srcq+srcstrideq] + MOV 
m4, [srcq+2*srcstrideq] lea srcq, [srcq+srcstrideq*2] punpcklbw m0, m3 punpcklbw m1, m4 +%if %1 == 4 + punpcklqdq m0, m1 + +.next2rows: + movd m5, [srcq+srcstrideq] + movd m6, [srcq+2*srcstrideq] + pmaddubsw m0, [myq-48] + punpcklbw m2, m5 + punpcklqdq m1, m2 + pmaddubsw m1, [myq-32] + punpcklbw m3, m6 + punpcklqdq m2, m3 + paddw m0, m1 + pmaddubsw m1, m2, [myq-16] + lea srcq, [srcq+2*srcstrideq] + paddsw m1, m0 + mova m0, m2 + pmulhrsw m1, [pw_256] + mova m2, m4 + packuswb m1, m1 + movd [dstq], m1 + mova m4, m6 + psrldq m1, 4 + movd [dstq+dststrideq], m1 + lea dstq, [dstq+2*dststrideq] + mova m1, m3 + mova m3, m5 + sub heightd, 2 + jg .next2rows +%else .nextrow: movh m5, [srcq+srcstrideq] ; read new row @@ -310,6 +347,7 @@ cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picr add dstq, dststrideq dec heightd ; next row jg .nextrow +%endif RET %endmacro _______________________________________________ ffmpeg-cvslog mailing list -- [email protected] To unsubscribe send an email to [email protected]
