I asked for a revert; instead you committed more on top of it, ignoring everything I said on IRC and in mail.
On 8/18/17, Rostislav Pehlivanov <g...@videolan.org> wrote: > ffmpeg | branch: master | Rostislav Pehlivanov <atomnu...@gmail.com> | Fri > Aug 18 19:29:33 2017 +0100| [3c99523a2864af729a8576c3fffe81fb884fa0d5] | > committer: Rostislav Pehlivanov > > opus_pvq_search: split functions into exactness and only use the exact if > its faster > > This splits the asm function into exact and non-exact version. The exact > version is as fast or faster on newer CPUs (which EXTERNAL_AVX_FAST > describes > well) whilst the non-exact version is faster than the exact on older CPUs. > > Also fixes yasm compilation which doesn't accept !cpuflags(avx) syntax. > > Signed-off-by: Rostislav Pehlivanov <atomnu...@gmail.com> > >> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=3c99523a2864af729a8576c3fffe81fb884fa0d5 > --- > > libavcodec/x86/opus_dsp_init.c | 14 +++++++------- > libavcodec/x86/opus_pvq_search.asm | 34 +++++++++++++++++++++------------- > 2 files changed, 28 insertions(+), 20 deletions(-) > > diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c > index c51f786ee8..a9f8a96159 100644 > --- a/libavcodec/x86/opus_dsp_init.c > +++ b/libavcodec/x86/opus_dsp_init.c > @@ -24,9 +24,9 @@ > #include "libavutil/x86/cpu.h" > #include "libavcodec/opus_pvq.h" > > -extern float ff_pvq_search_sse2(float *X, int *y, int K, int N); > -extern float ff_pvq_search_sse4(float *X, int *y, int K, int N); > -extern float ff_pvq_search_avx (float *X, int *y, int K, int N); > +extern float ff_pvq_search_approx_sse2(float *X, int *y, int K, int N); > +extern float ff_pvq_search_approx_sse4(float *X, int *y, int K, int N); > +extern float ff_pvq_search_exact_avx (float *X, int *y, int K, int N); > > av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) > { > @@ -34,12 +34,12 @@ av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) > > #if CONFIG_OPUS_ENCODER > if (EXTERNAL_SSE2(cpu_flags)) > - s->pvq_search = ff_pvq_search_sse2; > + s->pvq_search = ff_pvq_search_approx_sse2; > > if 
(EXTERNAL_SSE4(cpu_flags)) > - s->pvq_search = ff_pvq_search_sse4; > + s->pvq_search = ff_pvq_search_approx_sse4; > > - if (EXTERNAL_AVX(cpu_flags)) > - s->pvq_search = ff_pvq_search_avx; > + if (EXTERNAL_AVX_FAST(cpu_flags)) > + s->pvq_search = ff_pvq_search_exact_avx; > #endif > } > diff --git a/libavcodec/x86/opus_pvq_search.asm > b/libavcodec/x86/opus_pvq_search.asm > index 2f4864c95c..8cf040465d 100644 > --- a/libavcodec/x86/opus_pvq_search.asm > +++ b/libavcodec/x86/opus_pvq_search.asm > @@ -82,7 +82,7 @@ SECTION .text > %endif > %endmacro > > -%macro PULSES_SEARCH 1 > +%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation > ; m6 Syy_norm > ; m7 Sxy_norm > addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 > @@ -96,7 +96,7 @@ align 16 > movaps m4, [tmpY + r4] ; y[i] > movaps m5, [tmpX + r4] ; X[i] > > -%if !cpuflag(avx) ; for crappy ancient CPUs that have slow packed divs but > fast 1/sqrt > +%if %2 > xorps m0, m0 > cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) > %endif > @@ -104,7 +104,7 @@ align 16 > addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm > addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm > > -%if !cpuflag(avx) > +%if %2 > andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent > aproximation error from setting pulses in array padding. > %endif > > @@ -119,7 +119,7 @@ align 16 > andps m5, m0 ; (0<y)?m5:0 > %endif > > -%if !cpuflag(avx) > +%if %2 > rsqrtps m4, m4 > mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) > %else > @@ -211,8 +211,13 @@ align 16 > ; uint32 K - Number of pulses to have after quantizations. > ; uint32 N - Number of vector elements. 
Must be 0 < N < 256 > ; > -%macro PVQ_FAST_SEARCH 0 > -cglobal pvq_search, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N > +%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation > +%if %1 > +cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N > +%else > +cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N > +%endif > + > %define tmpX rsp > %define tmpY outYq > > @@ -255,7 +260,7 @@ align 16 > jz %%zero_input ; if (Sx==0) goto zero_input > > cvtsi2ss xm0, dword Kd ; m0 = K > -%if !cpuflag(avx) > +%if %1 > rcpss xm1, xm1 ; m1 = approx(1/Sx) > mulss xm0, xm1 ; m0 = K*(1/Sx) > %else > @@ -308,7 +313,7 @@ align 16 > align 16 ; K - pulses > 0 > %%add_pulses_loop: > > - PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm > + PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm > > sub Kd, 1 > jnz %%add_pulses_loop > @@ -320,7 +325,7 @@ align 16 ; K - pulses > 0 > align 16 > %%remove_pulses_loop: > > - PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm > + PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm > > add Kd, 1 > jnz %%remove_pulses_loop > @@ -367,12 +372,15 @@ align 16 > jmp %%return > %endmacro > > - > +; if 1, use a float op that give half precision but execute for around 3 > cycles. > +; On Skylake & Ryzen the division is much faster (around 11c/3), > +; that makes the full precision code about 2% slower. > +; Opus also does use rsqrt approximation in their intrinsics code. > INIT_XMM sse2 > -PVQ_FAST_SEARCH > +PVQ_FAST_SEARCH 1 > > INIT_XMM sse4 > -PVQ_FAST_SEARCH > +PVQ_FAST_SEARCH 1 > > INIT_XMM avx > -PVQ_FAST_SEARCH > +PVQ_FAST_SEARCH 0 I asked you to turn these into e.g. : %define USE_APPROXIMATION 0 INIT_XMM avx PVQ_FAST_SEARCH _exact where: %macro PVQ_FAST_SEARCH 1 cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N _______________________________________________ ffmpeg-cvslog mailing list ffmpeg-cvslog@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog