On Thu, Jan 14, 2016 at 6:54 PM, Ganesh Ajjanagadde <gajja...@mit.edu> wrote: > On Thu, Jan 14, 2016 at 6:42 PM, James Almer <jamr...@gmail.com> wrote: >> On 1/14/2016 7:46 PM, Ganesh Ajjanagadde wrote: >>> This improves accuracy (very slightly) and speed for processors having >>> fma3. >>> >>> Sample benchmark (fate flac-16-lpc-cholesky, Haswell): >>> old: >>> 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips >>> 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips >>> >>> new: >>> 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips >>> 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips >>> >>> Tested with FATE and --disable-fma3, also examined contents of >>> lavu/lls-test. >>> >>> Reviewed-by: James Almer <jamr...@gmail.com> >>> Reviewed-by: Henrik Gramner <hen...@gramner.com> >>> Signed-off-by: Ganesh Ajjanagadde <gajjanaga...@gmail.com> >>> --- >>> libavutil/x86/lls.asm | 61 >>> ++++++++++++++++++++++++++++++++++++++++++++++-- >>> libavutil/x86/lls_init.c | 4 ++++ >>> 2 files changed, 63 insertions(+), 2 deletions(-) >>> >>> diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm >>> index 769befb..925cbdb 100644 >>> --- a/libavutil/x86/lls.asm >>> +++ b/libavutil/x86/lls.asm >>> @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 >>> .ret: >>> REP_RET >>> >>> -%if HAVE_AVX_EXTERNAL >>> -INIT_YMM avx >>> +%macro UPDATE_LLS 0 >>> cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 >>> %define covarq ctxq >>> mov countd, [ctxq + LLSModel.indep_count] >>> @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, >>> count2 >>> vbroadcastsd ymm6, [varq + iq*8 + 16] >>> vbroadcastsd ymm7, [varq + iq*8 + 24] >>> vextractf128 xmm3, ymm1, 1 >>> +%if cpuflag(fma3) >>> + mova ymm0, COVAR(iq ,0) >>> + mova xmm2, COVAR(iq+2,2) >>> + fmaddpd ymm0, ymm1, ymm4, ymm0 >>> + fmaddpd xmm2, xmm3, xmm6, xmm2 >>> + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) >>> + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) >>> + mova COVAR(iq ,0), ymm0 >>> + mova COVAR(iq ,1), ymm1 >>> + mova COVAR(iq+2,2), xmm2 >>> + mova COVAR(iq+2,3), xmm3 >>> +%else >>> vmulpd ymm0, ymm1, ymm4 >>> vmulpd ymm1, ymm1, ymm5 >>> vmulpd xmm2, xmm3, xmm6 >>> @@ -148,12 +159,27 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, >>> count2 >>> ADDPD_MEM COVAR(iq ,1), ymm1 >>> ADDPD_MEM COVAR(iq+2,2), xmm2 >>> ADDPD_MEM COVAR(iq+2,3), xmm3 >>> +%endif ; cpuflag(fma3) >>> lea jd, [iq + 4] >>> cmp jd, count2d >>> jg .skip4x4 >>> .loop4x4: >>> ; Compute all 16 pairwise products of a 4x4 block >>> mova ymm3, [varq + jq*8] >>> +%if cpuflag(fma3) >>> + mova ymm0, COVAR(jq, 0) >>> + mova ymm1, COVAR(jq, 1) >>> + mova ymm2, COVAR(jq, 2) >>> + mova ymm3, COVAR(jq, 3) >> >> This is wrong. You're overwriting the contents of ymm3. libavutil/lls-test >> didn't reflect >> this at all? >> >>> + fmaddpd ymm0, ymm3, ymm4, ymm0 >>> + fmaddpd ymm1, ymm3, ymm5, ymm1 >>> + fmaddpd ymm2, ymm3, ymm6, ymm2 >>> + fmaddpd ymm3, ymm3, ymm7, ymm3 >>> + mova COVAR(jq, 0), ymm0 >>> + mova COVAR(jq, 1), ymm1 >>> + mova COVAR(jq, 2), ymm2 >>> + mova COVAR(jq, 3), ymm3 >>> +%else >>> vmulpd ymm0, ymm3, ymm4 >>> vmulpd ymm1, ymm3, ymm5 >>> vmulpd ymm2, ymm3, ymm6 >>> @@ -162,6 +188,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 >>> ADDPD_MEM COVAR(jq,1), ymm1 >>> ADDPD_MEM COVAR(jq,2), ymm2 >>> ADDPD_MEM COVAR(jq,3), ymm3 >>> +%endif ; cpuflag(fma3) >>> add jd, 4 >>> cmp jd, count2d >>> jle .loop4x4 >>> @@ -169,6 +196,20 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, >>> count2 >>> cmp jd, countd >>> jg .skip2x4 >>> mova xmm3, [varq + jq*8] >>> +%if cpuflag(fma3) >>> + mova xmm0, COVAR(jq, 0) >>> + mova xmm1, COVAR(jq, 1) >>> + mova xmm2, COVAR(jq, 2) >>> + mova xmm3, COVAR(jq, 3) >> >> Same here. > > Diff /tmp/lls.old /tmp/lls.new: > < real:-0.326534 order:0 pred:-0.588792 var:0.272632 coeffs:1.427836 > -0.385559 0.000000 > < real:-0.326534 order:1 pred:-0.326534 var:-nan coeffs:3.274787 > -1.436430 0.000000 > < real:-0.326534 order:2 pred:-0.326534 var:0.000000 coeffs:3.274787 > -1.436430 0.000000 > --- >> real:-0.326534 order:0 pred:-0.588792 var:0.272632 coeffs:1.427836 -0.385559 >> -0.000000 >> real:-0.326534 order:1 pred:-0.326534 var:0.000000 coeffs:3.274787 -1.436430 >> 0.000000 >> real:-0.326534 order:2 pred:-0.326534 var:-nan coeffs:3.274787 -1.436430 >> -0.000000 > > You are definitely right, though. Removed 2 lines: > mova ymm3, COVAR(jq,3) > and > mova xmm3, COVAR(jq,2)
Still buggy. Will calmly examine this some time. > >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel