On 1/14/2016 7:46 PM, Ganesh Ajjanagadde wrote: > This improves accuracy (very slightly) and speed for processors having > fma3. > > Sample benchmark (fate flac-16-lpc-cholesky, Haswell): > old: > 5993610 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips > 5951528 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips > > new: > 5252410 decicycles in ff_lpc_calc_coefs, 64 runs, 0 skips > 5232869 decicycles in ff_lpc_calc_coefs, 128 runs, 0 skips > > Tested with FATE and --disable-fma3, also examined contents of > lavu/lls-test. > > Reviewed-by: James Almer <jamr...@gmail.com> > Reviewed-by: Henrik Gramner <hen...@gramner.com> > Signed-off-by: Ganesh Ajjanagadde <gajjanaga...@gmail.com> > --- > libavutil/x86/lls.asm | 61 > ++++++++++++++++++++++++++++++++++++++++++++++-- > libavutil/x86/lls_init.c | 4 ++++ > 2 files changed, 63 insertions(+), 2 deletions(-) > > diff --git a/libavutil/x86/lls.asm b/libavutil/x86/lls.asm > index 769befb..925cbdb 100644 > --- a/libavutil/x86/lls.asm > +++ b/libavutil/x86/lls.asm > @@ -125,8 +125,7 @@ cglobal update_lls, 2,5,8, ctx, var, i, j, covar2 > .ret: > REP_RET > > -%if HAVE_AVX_EXTERNAL > -INIT_YMM avx > +%macro UPDATE_LLS 0 > cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > %define covarq ctxq > mov countd, [ctxq + LLSModel.indep_count] > @@ -140,6 +139,18 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > vbroadcastsd ymm6, [varq + iq*8 + 16] > vbroadcastsd ymm7, [varq + iq*8 + 24] > vextractf128 xmm3, ymm1, 1 > +%if cpuflag(fma3) > + mova ymm0, COVAR(iq ,0) > + mova xmm2, COVAR(iq+2,2) > + fmaddpd ymm0, ymm1, ymm4, ymm0 > + fmaddpd xmm2, xmm3, xmm6, xmm2 > + fmaddpd ymm1, ymm5, ymm1, COVAR(iq ,1) > + fmaddpd xmm3, xmm7, xmm3, COVAR(iq+2,3) > + mova COVAR(iq ,0), ymm0 > + mova COVAR(iq ,1), ymm1 > + mova COVAR(iq+2,2), xmm2 > + mova COVAR(iq+2,3), xmm3 > +%else > vmulpd ymm0, ymm1, ymm4 > vmulpd ymm1, ymm1, ymm5 > vmulpd xmm2, xmm3, xmm6 > @@ -148,12 +159,27 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, 
count2 > ADDPD_MEM COVAR(iq ,1), ymm1 > ADDPD_MEM COVAR(iq+2,2), xmm2 > ADDPD_MEM COVAR(iq+2,3), xmm3 > +%endif ; cpuflag(fma3) > lea jd, [iq + 4] > cmp jd, count2d > jg .skip4x4 > .loop4x4: > ; Compute all 16 pairwise products of a 4x4 block > mova ymm3, [varq + jq*8] > +%if cpuflag(fma3) > + mova ymm0, COVAR(jq, 0) > + mova ymm1, COVAR(jq, 1) > + mova ymm2, COVAR(jq, 2) > + mova ymm3, COVAR(jq, 3)
This is wrong. ymm3 was just loaded with the input values from [varq + jq*8], and this load of COVAR(jq, 3) overwrites it before the fmaddpd instructions below use it as the multiplicand. Did libavutil/lls-test not catch this at all? > + fmaddpd ymm0, ymm3, ymm4, ymm0 > + fmaddpd ymm1, ymm3, ymm5, ymm1 > + fmaddpd ymm2, ymm3, ymm6, ymm2 > + fmaddpd ymm3, ymm3, ymm7, ymm3 > + mova COVAR(jq, 0), ymm0 > + mova COVAR(jq, 1), ymm1 > + mova COVAR(jq, 2), ymm2 > + mova COVAR(jq, 3), ymm3 > +%else > vmulpd ymm0, ymm3, ymm4 > vmulpd ymm1, ymm3, ymm5 > vmulpd ymm2, ymm3, ymm6 > @@ -162,6 +188,7 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > ADDPD_MEM COVAR(jq,1), ymm1 > ADDPD_MEM COVAR(jq,2), ymm2 > ADDPD_MEM COVAR(jq,3), ymm3 > +%endif ; cpuflag(fma3) > add jd, 4 > cmp jd, count2d > jle .loop4x4 > @@ -169,6 +196,20 @@ cglobal update_lls, 3,6,8, ctx, var, count, i, j, count2 > cmp jd, countd > jg .skip2x4 > mova xmm3, [varq + jq*8] > +%if cpuflag(fma3) > + mova xmm0, COVAR(jq, 0) > + mova xmm1, COVAR(jq, 1) > + mova xmm2, COVAR(jq, 2) > + mova xmm3, COVAR(jq, 3) Same problem here: xmm3 was just loaded from [varq + jq*8], and this load overwrites it. _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel