Please revert this. Also, in future, please send a patch to the maillist so I could take a look on it.
On 8/18/17, Rostislav Pehlivanov <g...@videolan.org> wrote: > ffmpeg | branch: master | Rostislav Pehlivanov <atomnu...@gmail.com> | Fri > Aug 18 17:28:40 2017 +0100| [f386dd70acdc81d42d6bcb885d2889634cdf45b7] | > committer: Rostislav Pehlivanov > > opus_pvq_search: only use rsqrtps approximation on CPUs with avx You got the commit message in reverse. "rsqrtps" is the instruction that uses approximation and is faster. AVX CPUs are "allegedly" the ones that have faster divps, so the speed hit is not so bad. You want to use divps on new cpu and rsqrtps on old ones, not the reverse. > Makes the search produce idential results with the C version. > > Signed-off-by: Rostislav Pehlivanov <atomnu...@gmail.com> > >> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f386dd70acdc81d42d6bcb885d2889634cdf45b7 > --- > > libavcodec/x86/opus_pvq_search.asm | 18 ++++++------------ > 1 file changed, 6 insertions(+), 12 deletions(-) > > diff --git a/libavcodec/x86/opus_pvq_search.asm > b/libavcodec/x86/opus_pvq_search.asm > index beb6cbcc9b..2f4864c95c 100644 > --- a/libavcodec/x86/opus_pvq_search.asm > +++ b/libavcodec/x86/opus_pvq_search.asm > @@ -28,12 +28,6 @@ > ALIGNMODE p6 > %endif > > -; Use float op that give half precision but execute for around 3 cycles. > -; On Skylake & Ryzen the division is much faster (around 11c/3), > -; that makes the full precision code about 2% slower. > -; Opus also does use rsqrt approximation in their intrinsics code. > -%define USE_APPROXIMATION 1 > - > SECTION_RODATA 64 > > const_float_abs_mask: times 8 dd 0x7fffffff > @@ -102,17 +96,17 @@ align 16 > movaps m4, [tmpY + r4] ; y[i] > movaps m5, [tmpX + r4] ; X[i] > > - %if USE_APPROXIMATION == 1 > +%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but These changes are not necessary and they do obfuscate the code. The define name also documents what the different code path does. Using just "not AVX" doesn't explain why we use one code path and not the other. > fast 1/sqrt > xorps m0, m0 > cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) > - %endif > +%endif > > addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm > addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm > > - %if USE_APPROXIMATION == 1 > +%if !cpuflag(avx) > andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent > aproximation error from setting pulses in array padding. > - %endif > +%endif > > %else > movaps m5, [tmpY + r4] ; m5 = y[i] > @@ -125,7 +119,7 @@ align 16 > andps m5, m0 ; (0<y)?m5:0 > %endif > > -%if USE_APPROXIMATION == 1 > +%if !cpuflag(avx) > rsqrtps m4, m4 > mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) > %else > @@ -261,7 +255,7 @@ align 16 > jz %%zero_input ; if (Sx==0) goto zero_input > > cvtsi2ss xm0, dword Kd ; m0 = K > -%if USE_APPROXIMATION == 1 > +%if !cpuflag(avx) > rcpss xm1, xm1 ; m1 = approx(1/Sx) > mulss xm0, xm1 ; m0 = K*(1/Sx) > %else > The whole patch could be replaced with one liner: @@ -391,5 +391,7 @@ PVQ_FAST_SEARCH INIT_XMM sse4 PVQ_FAST_SEARCH +%define USE_APPROXIMATION 0 + INIT_XMM avx PVQ_FAST_SEARCH Best Regards Ivan Kalvachev _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog