First, I've removed the hacked formula. While it seems to improve precision in the synthetic test, it is not enough to avoid assigning pulses in the padded area. Worse, it also interferes with the special case of subtracting pulses from a y[i]==0 output. Handling these cases requires combining two masks and using "blendvps" instead of the faster "maxps". Most of these problems come from the fact that the hacked formula's result lies in the range -1.0 to 0.0, where 0.0 is a perfect match.
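To make the selection trade-off concrete, here is a minimal scalar sketch of the candidate-selection step with the exact metric. The helper name and the scalar form are mine, purely for illustration; the variable names (X, y, Sxy, Syy, p_max, max_idx) follow the comments in the asm below, and Syy is assumed to already carry the +1.0 for the new pulse, as in the patch.

static int find_best_pulse(const float *X, const float *y, int N_padded,
                           float Sxy, float Syy)
{
    /* With p = Sxy_new^2 / Syy_new the metric is never negative, so p_max
     * can start at 0 and a plain max (maxps + index update) picks the
     * winner; lanes in the zero padding (X[i] == 0) are forced to p = 0
     * and can never win.  A metric in [-1.0, 0.0], where 0.0 is a perfect
     * match, would make padded lanes look like winners, which is what
     * forces the extra mask and blendvps. */
    float p_max   = 0.0f;
    int   max_idx = 0;

    for (int i = 0; i < N_padded; i++) {
        float Sxy_new = Sxy + X[i];       /* y[] holds pre-doubled values, */
        float Syy_new = Syy + y[i];       /* as stored by the pre-search   */
        float p       = Sxy_new * Sxy_new / Syy_new;

        if (X[i] == 0.0f)                 /* padding and true zeros */
            p = 0.0f;

        if (p > p_max) {                  /* cmpps mask + maxps in the asm */
            p_max   = p;
            max_idx = i;
        }
    }
    return max_idx;
}

In the SIMD version the scalar comparison becomes a single cmpps mask that both feeds maxps on p and updates the packed index vector, which is why keeping the metric non-negative keeps the inner loop short.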
Second, I fixed the all_float rounding mode; I had it swapped in v2. Third, I reordered some code: the branch for an exact match of the pre-search is now the first one taken, and the constants needed by the pulse search are loaded only when they will actually be used, through a macro that conditionally loads them into registers or relabels them as memory operands. The same idea is used for loading and using the PIC register. I also use "smartalign" on NASM to avoid 15 consecutive 1-byte NOPs; YASM is smart by default.

I attach a second patch, a slightly modified version of atomnuker's code, that sums the distortions of the C and SIMD implementations. In my tests approx#2 seems to be pretty close to the C version, sometimes better, sometimes worse. If you find a file with a dramatic difference, I'm interested. :D

If there are no more issues, v4 will be cleaned up and ready for review.

Best Regards.
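As a reference for the rounding fix, here is a minimal scalar sketch of the pre-search guess and of the two PRESEARCH_ROUNDING modes. The function name and scalar form are mine and only illustrate the behaviour; tmpX is assumed to already hold |X[i]|, and the accumulator names follow the comments in the asm.

#include <math.h>

/* Distribute an initial guess of the K pulses proportionally to |X[i]|
 * and accumulate Sy, Sxy, Syy.  rounding == 0 truncates (cvttps2dq),
 * otherwise round to nearest (cvtps2dq); the all-float path uses roundps
 * with the matching immediate. */
static void presearch_guess(const float *tmpX, float *tmpY, int N,
                            int K, float Sx, int rounding,
                            float *Sy, float *Sxy, float *Syy)
{
    float res = (float)K / Sx;            /* pulses per unit of |X| */

    *Sy = *Sxy = *Syy = 0.0f;
    for (int i = 0; i < N; i++) {
        float yt = rounding ? rintf(res * tmpX[i])
                            : truncf(res * tmpX[i]);
        *Sy    += yt;
        *Sxy   += tmpX[i] * yt;
        *Syy   += yt * yt;
        tmpY[i] = 2.0f * yt;              /* stored pre-doubled, as in the asm */
    }
}

Whatever the guess misses (K minus the guessed pulse count) is then added, or removed if the guess overshot, one pulse at a time by the PULSES_SEARCH loops.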
From 17cbb22f1f3a9021332f95e38003b8ac4d714aec Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev <ikalvac...@gmail.com> Date: Thu, 8 Jun 2017 22:24:33 +0300 Subject: [PATCH 1/4] SIMD opus pvq_search implementation v3 --- libavcodec/opus_pvq.c | 9 + libavcodec/opus_pvq.h | 5 +- libavcodec/x86/Makefile | 1 + libavcodec/x86/opus_dsp_init.c | 47 +++ libavcodec/x86/opus_pvq_search.asm | 651 +++++++++++++++++++++++++++++++++++++ 5 files changed, 711 insertions(+), 2 deletions(-) create mode 100644 libavcodec/x86/opus_dsp_init.c create mode 100644 libavcodec/x86/opus_pvq_search.asm diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c index 2ac66a0ede..6c7504296d 100644 --- a/libavcodec/opus_pvq.c +++ b/libavcodec/opus_pvq.c @@ -418,7 +418,13 @@ static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_ int *y = pvq->qcoeff; celt_exp_rotation(X, N, blocks, K, spread, 1); + + { + START_TIMER gain /= sqrtf(pvq->pvq_search(X, y, K, N)); + STOP_TIMER("pvq_search"); + } + celt_encode_pulses(rc, y, N, K); celt_normalize_residual(y, X, N, gain); celt_exp_rotation(X, N, blocks, K, spread, 0); @@ -947,6 +953,9 @@ int av_cold ff_celt_pvq_init(CeltPVQ **pvq) s->encode_band = pvq_encode_band; s->band_cost = pvq_band_cost; + if (ARCH_X86 && CONFIG_OPUS_ENCODER) + ff_opus_dsp_init_x86(s); + *pvq = s; return 0; diff --git a/libavcodec/opus_pvq.h b/libavcodec/opus_pvq.h index 6691494838..9246337360 100644 --- a/libavcodec/opus_pvq.h +++ b/libavcodec/opus_pvq.h @@ -33,8 +33,8 @@ float *lowband_scratch, int fill) struct CeltPVQ { - DECLARE_ALIGNED(32, int, qcoeff )[176]; - DECLARE_ALIGNED(32, float, hadamard_tmp)[176]; + DECLARE_ALIGNED(32, int, qcoeff )[256]; + DECLARE_ALIGNED(32, float, hadamard_tmp)[256]; float (*pvq_search)(float *X, int *y, int K, int N); @@ -45,6 +45,7 @@ struct CeltPVQ { }; int ff_celt_pvq_init (struct CeltPVQ **pvq); +void ff_opus_dsp_init_x86(struct CeltPVQ *s); void ff_celt_pvq_uninit(struct CeltPVQ **pvq); #endif /* AVCODEC_OPUS_PVQ_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 0dbc46504e..6687fbcd72 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -52,6 +52,7 @@ OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp_init.o x86/synth_filter_init.o OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc_init.o +OBJS-$(CONFIG_OPUS_ENCODER) += x86/opus_dsp_init.o x86/opus_pvq_search.o OBJS-$(CONFIG_HEVC_DECODER) += x86/hevcdsp_init.o OBJS-$(CONFIG_JPEG2000_DECODER) += x86/jpeg2000dsp_init.o OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp_init.o diff --git a/libavcodec/x86/opus_dsp_init.c b/libavcodec/x86/opus_dsp_init.c new file mode 100644 index 0000000000..ddccf6fe9e --- /dev/null +++ b/libavcodec/x86/opus_dsp_init.c @@ -0,0 +1,47 @@ +/* + * Opus encoder assembly optimizations + * Copyright (C) 2017 Ivan Kalvachev <ikalvac...@gmail.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "config.h" + +#include "libavutil/x86/cpu.h" +#include "libavcodec/opus_pvq.h" + +extern float ff_pvq_search_sse2 (float *X, int *y, int K, int N); +extern float ff_pvq_search_sse42(float *X, int *y, int K, int N); +extern float ff_pvq_search_avx (float *X, int *y, int K, int N); +extern float ff_pvq_search_avx2 (float *X, int *y, int K, int N); + +av_cold void ff_opus_dsp_init_x86(CeltPVQ *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) + s->pvq_search = ff_pvq_search_sse2; + + if (EXTERNAL_SSE42(cpu_flags)) + s->pvq_search = ff_pvq_search_sse42; + + if (EXTERNAL_AVX(cpu_flags)) + s->pvq_search = ff_pvq_search_avx; + + if (EXTERNAL_AVX2(cpu_flags)) + s->pvq_search = ff_pvq_search_avx2; +} diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm new file mode 100644 index 0000000000..91a00e4024 --- /dev/null +++ b/libavcodec/x86/opus_pvq_search.asm @@ -0,0 +1,651 @@ +; Opus encoder assembly optimizations +; Copyright (C) 2017 Ivan Kalvachev <ikalvac...@gmail.com> +; This file is part of FFmpeg + +%include "config.asm" +%include "libavutil/x86/x86util.asm" + +%ifdef __NASM_VER__ +%use "smartalign" +ALIGNMODE p6 +%endif + +%define HADDPS_IS_FAST 0 +%define PHADDD_IS_FAST 0 +%define BLENDVPS_IS_FAST 01 +%define PBLENDVB_IS_FAST 01 +%define CONST_IN_X64_REG_IS_FASTER 01 + +%define STALL_WRITE_FORWARDING 0 +%define SHORT_SYY_UPDATE 01 + +%define USE_APPROXIMATION 02 +%define PRESEARCH_ROUNDING 01 +%define ALL_FLOAT_PRESEARCH cpuflag(sse42) && 0 + + +; +; Horizontal Sum Packed Single precision float +; The resulting sum is in all elements. +; +%macro HSUMPS 2 ; dst, tmp +%if cpuflag(sse42) && HADDPS_IS_FAST + %if sizeof%1>=32 ; avx mmsize=32 + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) + addps %1, %2 + %endif ; sse&avx mmsize=16 + haddps %1, %1 ; 5-6c 2/1 + haddps %1, %1 +%elif cpuflag(avx) + %if sizeof%1>=32 ; avx + vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%1)<<128+hi(%1) + addps %1, %2 + %endif + vshufps %2, %1, %1, q1032 ; %2=[b, a, d, c ] + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + vshufps %2, %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + +%elif 01 ; this form is a bit faster than the short avx-like emulation. + movaps %2, %1 ; [d, c, b, a ] + shufps %1, %1, q1032 ; %2=[b, a, d, c ] + addps %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + movaps %2, %1 + shufps %1, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + addps %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + ; all %1 members should be equal for as long as float a+b==b+a +%else + %error pick HSUMPS implementation +%endif +%endmacro + + +; +; Horizontal Sum integer Double word +; The resulting sum is in all elements. +; +%macro HSUMDQ 2 ; dst, tmp +%if cpuflag(avx) && notcpuflag(avx2) && sizeof%1>=32 + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. 
+ vperm2f128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd xmm%1,xmm%2 + phaddd xmm%1,xmm%1 + phaddd xmm%1,xmm%1 + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1 +%elif cpuflag(ssse3) && PHADDD_IS_FAST + %if sizeof%1>=32 + vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd %1, %2 + %endif ;mmsize == 16 + phaddd %1, %1 ;3 cycles on intel + phaddd %1, %1 + ;6 cycles total (sse) +%elif 01 + %if cpuflag(avx2) && sizeof%1>=32 ;mmsize>=32 + vperm2i128 %2, %1, %1, (0)*16+(1) ; %2=lo(%2)<<128+hi(%2) + paddd %1, %2 + %endif + pshufd %2, %1, q1032 ; %2=[b, a, d, c ] + paddd %1, %2 ; %1=[b+d, a+c, d+b, c+a ] + pshufd %2, %1, q0321 ; %2=[c+a, b+d, a+c, d+b ] + paddd %1, %2 ; %1=[c+a+b+d, b+d+a+c, a+c+d+b, d+b+c+a ] + ;4 cycles total (sse) +%else + %error pick HSUMDQ implementation +%endif +%endmacro + + +; +; Emulate blendvps if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard fixed to use xmm0 as mask +; +%macro emu_blendvps 3 ; dst_a, src_b, mask +%if cpuflag(avx) && BLENDVPS_IS_FAST + blendvps %1, %1, %2, %3 +%elif cpuflag(sse42) && BLENDVPS_IS_FAST + %if notcpuflag(avx) + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3d operand, you used %3 + %endif + %endif + blendvps %1, %2, %3 +%elif 01 + xorps %2, %1 + andps %2, %3 + xorps %1, %2 +%else + %error pick "blendvps" emulation +%endif +%endmacro + + +; +; Emulate pblendvb if not available +; +; src_b destroyed when using emulation with logical operands +; SSE41 blend instruction is hard fixed to use xmm0 as mask +; +%macro emu_pblendvb 3 ; dst_a, src_b, mask +%if cpuflag(avx2) && PBLENDVB_IS_FAST + pblendvb %1, %1, %2, %3 +;------------------------- +%elif cpuflag(avx) && notcpuflag(avx2) && PBLENDVB_IS_FAST + %if sizeof%1>=32 +;AVX1 does not implement pblendvb that works with ymm +;AVX1 does not implemnt pxor that works with ymm either, +; so no emulation possible +; Use float point blend and hope the penalty is not severe + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. + vblendvps %1, %1, %2, %3 + %else + vpblendvb %1, %1, %2, %3 + %endif +;------------------------- +%elif cpuflag(sse42) && PBLENDVB_IS_FAST + %ifnidn %3,xmm0 + %error sse41 blendvps uses xmm0 as default 3 operand, you used %3 + %endif + pblendvb %1, %2, %3 +;------------------------- +%elif 01 +;On my cpu, this version is actually faster than pblendvb ! +;pblendvb is much slower than blenvps, go figure! 
+ pxor %2, %1 + pand %2, %3 + pxor %1, %2 +%else + %error pick "pblendvb" emulation +%endif +%endmacro + + +; +; Emulate broadcastss if not available +; +%macro emu_broadcastss 2 ; dst_a, src_b +%if cpuflag(avx2) && 01 + vbroadcastss %1, %2 ; ymm, xmm + +%elif cpuflag(avx) && 01 + %ifnum sizeof%2 ; avx1 register + vpermilps xmm%1, xmm%2, q0000 ; xmm, xmm, imm || ymm, ymm, imm + %if sizeof%1 >= 32 ;mmsize>=32 + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm, im ; 3c 1/1 +; vperm2f128 %1, %1, %1, 0 ; ymm, ymm, ymm ; 2-3c 1/1 + %endif + %else ; avx1 memory + vbroadcastss %1, %2 ; ymm, mm32 || xmm, mm32 + %endif +%elif cpuflag(sse) && 1 + %ifnum sizeof%2 ; sse register + shufps %1, %2, %2, q0000 + %else ; sse memory +; xorps %1, %1 ; this seems to provide minor speedup + movss %1, %2 ; this zeroes the other 3 elements + shufps %1, %1, 0 + %endif +%else + %error invalid op, SSE needed for emulation +%endif +%endmacro + +; +; Emulate pbroadcastd if not available +; +%macro emu_pbroadcastd 2 ; dst_a, src_b +%if cpuflag(avx2) && 01 + vpbroadcastd %1, %2 +%elif cpuflag(avx) && mmsize >= 32 + ;there are no integer operations with 256bit ymm registers on AVX1 + pshufd xmm%1, xmm%2, q0000 ; xmm, xmm, im + vinsertf128 %1, %1,xmm%1, 1 ; ymm, ymm, xmm. im + %error emulating integer 256bit ymm operation with float point op. there will be speed penalty. +%elif cpuflag(sse) + %ifnum sizeof%2 ; sse register + pshufd %1, %2, q0000 + %else ; sse memory +; pxor %1, %1 ; this seems to provide minor speedup. On intel it is direct op, so no micro-ops, letency or throughput. + movd %1, %2 ; movd zero extends + pshufd %1, %1, 0 + %endif +%else + %error invalid op, SSE2 needed +%endif +%endmacro + +; +; Setup High register to be used +; for holding memory constants +; +; %1 - the register to be used, assmues it is >= mm8 +; %2 - name of the constant. +; +; Subsequent opcodes are going to use the constant in the form +; "addps m0, mm_const_name" and it would be turned into: +; "addps m0, [const_name]" on 32bit arch or +; "addps m0, m8" or 64 bit arch +%macro set_hi_mm_constant 2 ; reg, const_label +%if num_mmregs > 8 && CONST_IN_X64_REG_IS_FASTER + %define mm_%2 %1 + movaps %1, [%2] +%else + %define mm_%2 [%2] +%endif +%endmacro + +; +; Load Effective Address of +; Position Independent Code +; Base address of a constant +; %1 - the register to be used, if PIC is set +; %2 - name of the constant. 
+; +; Subsequent optcode are going to use the base address in the form +; "movaps m0, [pic_base_constant_name+r4q]" and it would be turned into +; "movaps m0, [r5q + r4q]" if PIC is enabled +; "movaps m0, [constant_name + r4q]" if texrel are used +%macro lea_pic_base 2; reg, const_label +%ifdef PIC + lea %1, [%2] ; rip+const + %define pic_base_%2 %1 +%else + %define pic_base_%2 %2 +%endif +%endmacro + + +;big ugly hack +%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32 +; AVX1 does not implement integer operations with 256bit ymm registers +; Some operations however could be emulated with float point operations + %define ALL_FLOAT_PRESEARCH 1 + %define pxor xorps + %define por orps + %define pand andps + %define vextracti128 vextractf128 + %define emu_pblendvb emu_blendvps + %define emu_pbroadcastd emu_broadcastss +%endif + + +SECTION_RODATA 64 + +const_float_abs_mask: times 8 dd 0x7fffffff +const_align_abs_edge: times 8 dd 0 + +const_float_0_5: times 8 dd 0.5 +const_float_1: times 8 dd 1.0 +const_float_2: times 8 dd 2.0 +const_float_sign_mask: times 8 dd 0x80000000 + +const_int32_offsets: + %rep 8 + dd $-const_int32_offsets + %endrep +SECTION .text + + + + + +%macro PULSES_SEARCH 1 +; m6 Syy_norm +; m7 Sxy_norm + xor r4q, r4q + pxor m1, m1 ; max_idx + xorps m3, m3 ; p_max + addps m6, mm_const_float_1 ; Syy_norm += 1.0 +align 16 +%%distortion_search: + movd xmm2, dword r4d ; movd zero extends +%ifidn %1,add + movaps m4, [tmpY + r4q] ; y[i] + movaps m5, [tmpX + r4q] ; X[i] + +%if USE_APPROXIMATION >= 1 + xorps m0, m0 + cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) +%endif + + addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm + addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm + +%if USE_APPROXIMATION >= 1 + andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. +%endif + +; addps m4, m6, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i] +; addps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i] + +; movaps m4, m6 ; Syy_norm +; movaps m5, m7 ; Sxy_norm +; addps m4, [tmpY + r4q] ; m4 = Syy_new = Syy_norm + y[i] +; addps m5, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm + X[i] +%else + movaps m5, [tmpY + r4q] ; m5 = y[i] + + xorps m0, m0 ; m0 = 0; + cmpps m0, m0, m5, 1 ; m0 = (0<y) + + subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i] + subps m5, m7, [tmpX + r4q] ; m5 = Sxy_new = Sxy_norm - X[i] + andps m5, m0 ; (0<y)?m5:0 +%endif + +%if USE_APPROXIMATION == 2 + rsqrtps m4, m4 + mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) +%elif USE_APPROXIMATION == 1 + mulps m5, m5 + rcpps m4, m4 + mulps m5, m4 ; m5 = p = (Sxy_new*Sxy_new - Syy)*approx(1/Syy) +%else + mulps m5, m5 + divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy +%endif + emu_pbroadcastd m2, xmm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later) + + cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max) + maxps m3, m5 ; m3=max(p_max,p) + ; maxps here is faster than blendvps, despite blend having lower latency. + ; maxps is also shorter so the inner loop fits in icache segment +; emu_blendvps m3, m5, m0 ; m3 = m0 ? p : p_max + ; SSE4.1 is using hardcoded xmm0 register as third argument +%if cpuflag(avx) && notcpuflag(avx2) && mmsize == 32 + emu_pblendvb m1, m2, m0 +%elif 01 + pand m2, m0 ; This version seems faster than sse41 pblendvb + pmaxsw m1, m2 ; SSE2 signed word, so it would work for N < 32768/4 +%else + pandn m0, m1 ; version for r4q decreasing. It's not faster than real blendv. 
+ pmaxsw m0, m2 + movdqa m1, m0 +%endif + + add r4q, mmsize + cmp r4q, Nq + jb %%distortion_search + + por m1, mm_const_int32_offsets ; max_idx offset per individual lane (skipped in the inner loop) + movdqa m4, m1 ; needed for the aligned y[max_idx]+=2; processing + +%if mmsize >= 32 +;merge parallel maximums round 2 (4 vs 4) + + vextractf128 xmm5, ymm3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b] + cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] ) + + vextracti128 xmm2, ymm1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b] + emu_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128] + emu_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? p[1x128] : p[0x128] +%endif + +;merge parallel maximums round 1 (2 vs 2) + ; m3=p[3210] + movhlps xmm5, xmm3 ; m5=p[xx32] + cmpps xmm0, xmm3, xmm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] ) + + pshufd xmm2, xmm1, q3232 + emu_blendvps xmm3, xmm5, xmm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0] + emu_pblendvb xmm1, xmm2, xmm0 ; p = m0 ? p[3,2] : p[1,0] + +;merge parallel maximums final round (1 vs 1) + movaps xmm0, xmm3 ; m0 = m3[0] = p[0] + shufps xmm3, xmm3, q1111 ; m3 = m3[1] = p[1] + cmpless xmm0, xmm3 + + pshufd xmm2, xmm1, q1111 + emu_pblendvb xmm1, xmm2, xmm0 + + movd dword r4d, xmm1 ; zero extends to the rest of r4q + +;pextrw r4d,m1,0 ; extract word (16b) from pos0 to r4d + + emu_broadcastss m3, [tmpX + r4q] + %{1}ps m7, m3 ; Sxy += X[max_idx] + +%if STALL_WRITE_FORWARDING == 0 + %if SHORT_SYY_UPDATE == 1 + emu_broadcastss m5, [tmpY + r4q] + %{1}ps m6, m5 ; Syy += Y[max_idx] + %endif + ; this code is big and ugly + ; but it prevents stalling a store forwarding + ; by writing smaller size and then doing + ; big load from same address. + ; This helps with N<100 that are the typical case. + emu_pbroadcastd m1, xmm1 + pcmpeqd m1, m1, m4 ; only 1 element matches the address of the new p_max + + + and r4q, ~(mmsize-1) ; align down, so max_idx is (somewhere) inside mmsize load + movaps m5, [tmpY + r4q] ; m5 = Y[y3...ym...y0] + movaps m3, mm_const_float_2 ; m3 = [2.0, 2.0, 2.0, 2.0] + %if SHORT_SYY_UPDATE == 0 + movaps m2, m5 ; m2 = m5 + %endif + andps m3, m1 ; m3 = [ 0...2.0...0] + %{1}ps m5, m3 ; m5 = Y[y3...ym...y0] +/- [0...2.0...0] + movaps [tmpY + r4q], m5 ; Y[max_idx] +-= 2.0; + + %if SHORT_SYY_UPDATE == 0 + andps m2, m1 ; m2 = Y[0...ym...0] + HSUMPS m2, m3 ; m2 = Y[ym, ym, ym, ym]; + %{1}ps m6, m2 ; Syy += Y[max_idx] + %endif +%else + ; this code is shorter, simpler and slower + ; because it breaks store forwarding + ; it writes smaller size at unaligned address where we later do a bigger aligned load. + ; When we have biger vectors (N>100) the store might complete before we try to load it + emu_broadcastss m4, [tmpY + r4q] + %{1}ps m6, m4 ; Syy += outY[max_idx] + %{1}ps m4, mm_const_float_2 + movss [tmpY + r4q], xmm4 ; Y[max_idx] += 2.0; +%endif + +%endmacro + +; +; Pyramid Vector Quantization Search implementation +; +; float * inX - Unaligned (SIMD) access, it will be overread, +; but extra data will be masked away. +; int32 * outY - Should be aligned and padded buffer. +; It would be used as temp buffer. +; uint32 K - Number of pulses to have after quantizations. +; uint32 N - Number of vector elements. 
Must be 0 < N < 256 +; +%macro PVQ_FAST_SEARCH 0 +cglobal pvq_search,4,6,8, 256*4, inX, outY, K, N +%define tmpX rsp +%define tmpY outYq + + +; movsxdifnidn Nq, Nd + movaps m0, [const_float_abs_mask] + shl Nq, 2 ; N *= sizeof(float) + mov r4q, Nq + + neg r4q + and r4q, mmsize-1 + + lea_pic_base r5q, const_align_abs_edge ; rip+const + movups m3, [pic_base_const_align_abs_edge + r4q - mmsize] + + add Nq, r4q ; Nq = align(Nq, mmsize) +; sub rsp, Nq ; allocate tmpX[Nq] + + lea r4q, [Nq - mmsize] ; Nq is rounded up (aligned up) to mmsize, so r4q can't become negative here, unless N=0. + movups m2, [inXq + r4q] + andps m2, m3 + movaps [tmpX + r4q], m2 + movaps m1, m2 ; Sx + +align 16 +%%loop_abs_sum: + sub r4q, mmsize + jc %%end_loop_abs_sum + + movups m2, [inXq + r4q] + andps m2, m0 + + movaps [tmpX + r4q], m2 ; tmpX[i]=abs(X[i]) + addps m1, m2 + jmp %%loop_abs_sum + + +align 16 +%%end_loop_abs_sum: + + HSUMPS m1, m2 ; m1 = Sx + + xorps m0, m0 + comiss xmm0, xmm1 ; + jz %%zero_input ; if (Sx==0) goto zero_input + + cvtsi2ss xmm0, dword Kd ; m0 = K +%if USE_APPROXIMATION >= 1 && 0 + rcpss xmm1, xmm1 ; m1 = approx(1/Sx) + mulss xmm0, xmm1 ; m0 = K*(1/Sx) +%else + divss xmm0, xmm1 ; b = K/Sx + ; b = K/max_x +%endif + + emu_broadcastss m0, xmm0 + + lea r4q, [Nq - mmsize] + pxor m5, m5 ; Sy ( Sum of y[i] ) + xorps m6, m6 ; Syy ( Sum of y[i]*y[i] ) + xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] ) +align 16 +%%loop_guess: + movaps m1, [tmpX + r4q] ; m1 = X[i] + mulps m2, m0, m1 ; m2 = res*X[i] +%if ALL_FLOAT_PRESEARCH == 1 + %if PRESEARCH_ROUNDING == 0 + roundps m2, m2, 3 ; yt = (float)truncf( res*X[i] ) + %else + roundps m2, m2, 0 ; yt = (float)lrintf( res*X[i] ) + %endif + addps m5, m2 ; Sy += yt +%else + %if PRESEARCH_ROUNDING == 0 + cvttps2dq m2, m2 ; yt = (int)truncf( res*X[i] ) + %else + cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] ) + %endif + paddd m5, m2 ; Sy += yt + cvtdq2ps m2, m2 ; yt = (float)yt +%endif + mulps m1, m2 ; m1 = X[i]*yt + mulps m3, m2, m2 ; m3 = yt*yt + addps m7, m1 ; Sxy += m1; + addps m6, m3 ; Syy += m3 + addps m2, m2 ; m2 = 2*yt + movaps [tmpY + r4q], m2 ; y[i] = m2 + + sub r4q, mmsize + jnc %%loop_guess +; add r4q, mmsize +; cmp r4q, Nq +; jb %%loop_guess + + + HSUMPS m6, m1 ; Syy_norm + HSUMPS m7, m0 ; Sxy_norm + +%if ALL_FLOAT_PRESEARCH == 1 + HSUMPS m5, m4 ; pulses + cvtss2si r4q, xmm5 +%else + HSUMDQ m5, m4 ; pulses + movd dword r4d, xmm5 ; zero extends to the rest of r4q +%endif + + + sub Kq, r4q ; K -= pulses + jz %%finish ; K - pulses == 0 + + set_hi_mm_constant m11, const_float_1 + set_hi_mm_constant m12, const_float_2 + set_hi_mm_constant m13, const_int32_offsets + + jc %%remove_pulses_loop ; K - pulses < 0 + +align 16 ; K - pulses > 0 +%%add_pulses_loop: + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm + + sub Kq, 1 + jnz %%add_pulses_loop + + jmp %%finish + +align 16 +%%remove_pulses_loop: + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm + + add Kq, 1 + jnz %%remove_pulses_loop + + +align 16 +%%finish: + lea r4q, [Nq - mmsize] + + movaps m2, [const_float_sign_mask] + movaps m4, [const_float_0_5] +align 16 +%%restore_sign_loop: + movaps m0, [tmpY + r4q] ; m0 = Y[i] + movups m1, [inXq + r4q] ; m1 = X[i] + andps m1, m2 ; m1 = sign(X[i]) + mulps m0, m4 ; m0 = Y[i]*0.5 + orps m0, m1 ; m0 = Y[i]*0.5*sign + cvtps2dq m3, m0 ; m3 = (int)m0 + movaps [outYq + r4q], m3 + + sub r4q, mmsize + jnc %%restore_sign_loop +%%return: +;release the temp buffer and restore the stack +; add rsp, Nq ; free tmpX, Nq is the same as the one used to allocate it (after Nq has been aligned up) + +%if 
ARCH_X86_64 == 0 ;sbrdsp + movss r0m, xmm6 ; return (float)Syy_norm + fld dword r0m +%else + movaps m0, m6 ; return (float)Syy_norm +%endif + RET + +align 16 +%%zero_input: + lea r4q, [Nq - mmsize] + xorps m0, m0 +%%zero_loop: + movaps [outYq + r4q], m0 + sub r4q, mmsize + jnc %%zero_loop + + movaps m6, [const_float_1] + jmp %%return +%endmacro + + +INIT_XMM sse2 +PVQ_FAST_SEARCH + +INIT_XMM sse42 +PVQ_FAST_SEARCH + +INIT_XMM avx +PVQ_FAST_SEARCH + +INIT_YMM avx2 +PVQ_FAST_SEARCH -- 2.13.0
From a1005984a6e144fcd1701b4b71e9de5a17ebbca9 Mon Sep 17 00:00:00 2001 From: Ivan Kalvachev <ikalvac...@gmail.com> Date: Sat, 1 Jul 2017 00:34:22 +0300 Subject: [PATCH 2/4] Code to measure the total distortion sum of the reference and dsp implementation. --- libavcodec/opus_pvq.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/libavcodec/opus_pvq.c b/libavcodec/opus_pvq.c index 6c7504296d..3ad995e7a4 100644 --- a/libavcodec/opus_pvq.c +++ b/libavcodec/opus_pvq.c @@ -420,10 +420,77 @@ static uint32_t celt_alg_quant(OpusRangeCoder *rc, float *X, uint32_t N, uint32_ celt_exp_rotation(X, N, blocks, K, spread, 1); { - START_TIMER - gain /= sqrtf(pvq->pvq_search(X, y, K, N)); - STOP_TIMER("pvq_search"); + static double adist1 = 0.0f; + static double adist2 = 0.0f; + static int cnt,cntlg; + float xn1[256]; + float xn2[256]; + int ysec[256]; + + float sq1 = 0.0f; + float sq2 = 0.0f; + + float dist1 = 0.0f; + float dist2 = 0.0f; + + int sum1 = 0, sum2 = 0; + + // START_TIMER + gain /= sqrtf(pvq->pvq_search(X, y, K, N)); + // STOP_TIMER("pvq_search"); + + ppp_pvq_search_c(X, ysec, K, N); + + + for (int i = 0; i < N; i++) { + sq1 += y[i]*y[i]; + sq2 += ysec[i]*ysec[i]; + sum1 += FFABS(y[i]); + sum2 += FFABS(ysec[i]); + } + + if (sum1 != K) { + printf("\nERROR! sum Sy=%i K=%i N=%d \n", sum1, K, N); + printf(" X = "); + for(int i=0; i<N; i++){ + printf("%8X, ",((unsigned int *)X)[i]); + } + printf("\n y1 = "); + for(int i=0; i<N; i++){ + printf("%8d, ", y[i]); + } + printf("\n y2 = "); + for(int i=0; i<N; i++){ + printf("%8d, ", ysec[i]); + } + printf("\nERROR!\n"); + goto what; + //av_assert0(0); + } + + + for (int i = 0; i < N; i++) { + xn1[i] = (float)y[i]/sq1; + xn2[i] = (float)ysec[i]/sq2; + dist1 += (X[i] - xn1[i])*(X[i] - xn1[i]); + dist2 += (X[i] - xn2[i])*(X[i] - xn2[i]); + } + + dist1 = sqrtf(dist1); + dist2 = sqrtf(dist2); + + adist1 += (double)dist1; + adist2 += (double)dist2; + + cnt++; + if(cnt>=2*cntlg){ + cntlg=cnt; + printf("\nruns = %d\n", cnt); + printf("Distortion1 = %f\n", adist1); + printf("Distortion2 = %f\n", adist2); + } } +what: celt_encode_pulses(rc, y, N, K); celt_normalize_residual(y, X, N, gain); -- 2.13.0