On Sat, Dec 23, 2017 at 09:52:11PM +0100, Aurelien Jacobs wrote: > On Sat, Dec 23, 2017 at 05:47:04PM -0300, James Almer wrote: > > On 12/23/2017 5:44 PM, Aurelien Jacobs wrote: > > > On Sat, Dec 23, 2017 at 03:35:28PM -0300, James Almer wrote: > > >> On 12/23/2017 3:01 PM, Aurelien Jacobs wrote: > > >>> This was originally based on libsbc, and was fully integrated into > > >>> ffmpeg. > > >>> > > >>> Rough speed test: > > >>> C version: speed= 592x > > >>> MMX version: speed= 785x > > >>> --- > > >>> libavcodec/sbcdsp.c | 3 + > > >>> libavcodec/sbcdsp.h | 2 + > > >>> libavcodec/x86/Makefile | 2 + > > >>> libavcodec/x86/sbcdsp.asm | 284 > > >>> +++++++++++++++++++++++++++++++++++++++++++ > > >>> libavcodec/x86/sbcdsp_init.c | 51 ++++++++ > > >>> 5 files changed, 342 insertions(+) > > >>> create mode 100644 libavcodec/x86/sbcdsp.asm > > >>> create mode 100644 libavcodec/x86/sbcdsp_init.c > > >> > > >> [...] > > >> > > >>> +;******************************************************************* > > >>> +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], > > >>> +; uint32_t scale_factor[2][8], > > >>> +; int blocks, int channels, int subbands) > > >>> +;******************************************************************* > > >>> +INIT_MMX mmx > > >>> +cglobal sbc_calc_scalefactors, 5, 7, 3, sb_sample_f, scale_factor, > > >>> blocks, channels, subbands, ptr, blk > > >>> + ; subbands = 4 * subbands * channels > > >>> + shl subbandsd, 2 > > >>> + cmp channelsd, 2 > > >>> + jl .loop_1 > > >>> + shl subbandsd, 1 > > >>> + > > >>> +.loop_1: > > >>> + sub subbandsq, 8 > > >>> + lea ptrq, [sb_sample_fq + subbandsq] > > >>> + > > >>> + ; blk = (blocks - 1) * 64; > > >>> + lea blkq, [blocksq - 1] > > >>> + shl blkd, 6 > > >>> + > > >>> + movq m0, [scale_mask] > > >> > > >> I insist, this can be easily loaded outside the loop. You have enough > > >> spare regs to store a copy. > > > > > > Oh, I forgot to reply to this. 
There isn't any register left available > > > on x86_32, hence why I kept those loads inside the loop. > > > > You're not using a GPR to store the mask, nor do you need to. You're using mmx > > regs and have 5 left. > > Oh, indeed! Not sure why it didn't even cross my mind... > I will have a look at this.
Here is the updated patch, with the scale_mask load moved out of the loop.
>From 5c8ac4e2d23c0ebf27d6592645d53b60d07d47ef Mon Sep 17 00:00:00 2001 From: Aurelien Jacobs <au...@gnuage.org> Date: Sun, 17 Dec 2017 20:07:33 +0100 Subject: [PATCH 7/9] sbcenc: add MMX optimizations This was originally based on libsbc, and was fully integrated into ffmpeg. Rough speed test: C version: speed= 592x MMX version: speed= 785x --- libavcodec/sbcdsp.c | 3 + libavcodec/sbcdsp.h | 2 + libavcodec/x86/Makefile | 2 + libavcodec/x86/sbcdsp.asm | 285 +++++++++++++++++++++++++++++++++++++++++++ libavcodec/x86/sbcdsp_init.c | 51 ++++++++ 5 files changed, 343 insertions(+) create mode 100644 libavcodec/x86/sbcdsp.asm create mode 100644 libavcodec/x86/sbcdsp_init.c diff --git a/libavcodec/sbcdsp.c b/libavcodec/sbcdsp.c index e155387f0d..2d0addcf28 100644 --- a/libavcodec/sbcdsp.c +++ b/libavcodec/sbcdsp.c @@ -379,4 +379,7 @@ av_cold void ff_sbcdsp_init(SBCDSPContext *s) /* Default implementation for scale factors calculation */ s->sbc_calc_scalefactors = sbc_calc_scalefactors; s->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j; + + if (ARCH_X86) + ff_sbcdsp_init_x86(s); } diff --git a/libavcodec/sbcdsp.h b/libavcodec/sbcdsp.h index 66ed7d324e..127e6a8a11 100644 --- a/libavcodec/sbcdsp.h +++ b/libavcodec/sbcdsp.h @@ -80,4 +80,6 @@ struct sbc_dsp_context { */ void ff_sbcdsp_init(SBCDSPContext *s); +void ff_sbcdsp_init_x86(SBCDSPContext *s); + #endif /* AVCODEC_SBCDSP_H */ diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index a805cd37b4..2350c8bbee 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -63,6 +63,7 @@ OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp_init.o +OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp_init.o OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc_init.o OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp_init.o OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp_init.o @@ -172,6 +173,7 
@@ X86ASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o X86ASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o X86ASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv40dsp.o +X86ASM-OBJS-$(CONFIG_SBC_ENCODER) += x86/sbcdsp.o X86ASM-OBJS-$(CONFIG_SVQ1_ENCODER) += x86/svq1enc.o X86ASM-OBJS-$(CONFIG_TAK_DECODER) += x86/takdsp.o X86ASM-OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o diff --git a/libavcodec/x86/sbcdsp.asm b/libavcodec/x86/sbcdsp.asm new file mode 100644 index 0000000000..4e02263a63 --- /dev/null +++ b/libavcodec/x86/sbcdsp.asm @@ -0,0 +1,285 @@ +;****************************************************************************** +;* SIMD optimized SBC encoder DSP functions +;* +;* Copyright (C) 2017 Aurelien Jacobs <au...@gnuage.org> +;* Copyright (C) 2008-2010 Nokia Corporation +;* Copyright (C) 2004-2010 Marcel Holtmann <mar...@holtmann.org> +;* Copyright (C) 2004-2005 Henryk Ploetz <hen...@ploetzli.ch> +;* Copyright (C) 2005-2006 Brad Midgley <bmidg...@xmission.com> +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +scale_mask: times 2 dd 0x8000 ; 1 << (SBC_PROTO_FIXED_SCALE - 1) + +SECTION .text + +;******************************************************************* +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts); +;******************************************************************* +INIT_MMX mmx +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts + movq m0, [inq] + movq m1, [inq+8] + pmaddwd m0, [constsq] + pmaddwd m1, [constsq+8] + paddd m0, [scale_mask] + paddd m1, [scale_mask] + + movq m2, [inq+16] + movq m3, [inq+24] + pmaddwd m2, [constsq+16] + pmaddwd m3, [constsq+24] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+32] + movq m3, [inq+40] + pmaddwd m2, [constsq+32] + pmaddwd m3, [constsq+40] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+48] + movq m3, [inq+56] + pmaddwd m2, [constsq+48] + pmaddwd m3, [constsq+56] + paddd m0, m2 + paddd m1, m3 + + movq m2, [inq+64] + movq m3, [inq+72] + pmaddwd m2, [constsq+64] + pmaddwd m3, [constsq+72] + paddd m0, m2 + paddd m1, m3 + + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE + packssdw m0, m0 + packssdw m1, m1 + + movq m2, m0 + pmaddwd m0, [constsq+80] + pmaddwd m2, [constsq+88] + + movq m3, m1 + pmaddwd m1, [constsq+96] + pmaddwd m3, [constsq+104] + paddd m0, m1 + paddd m2, m3 + + movq [outq ], m0 + movq [outq+8], m2 + + RET + + + +;******************************************************************* +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts); +;******************************************************************* +INIT_MMX mmx +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts + 
movq m0, [inq] + movq m1, [inq+8] + movq m2, [inq+16] + movq m3, [inq+24] + pmaddwd m0, [constsq] + pmaddwd m1, [constsq+8] + pmaddwd m2, [constsq+16] + pmaddwd m3, [constsq+24] + paddd m0, [scale_mask] + paddd m1, [scale_mask] + paddd m2, [scale_mask] + paddd m3, [scale_mask] + + movq m4, [inq+32] + movq m5, [inq+40] + movq m6, [inq+48] + movq m7, [inq+56] + pmaddwd m4, [constsq+32] + pmaddwd m5, [constsq+40] + pmaddwd m6, [constsq+48] + pmaddwd m7, [constsq+56] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + movq m4, [inq+64] + movq m5, [inq+72] + movq m6, [inq+80] + movq m7, [inq+88] + pmaddwd m4, [constsq+64] + pmaddwd m5, [constsq+72] + pmaddwd m6, [constsq+80] + pmaddwd m7, [constsq+88] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + movq m4, [inq+96] + movq m5, [inq+104] + movq m6, [inq+112] + movq m7, [inq+120] + pmaddwd m4, [constsq+96] + pmaddwd m5, [constsq+104] + pmaddwd m6, [constsq+112] + pmaddwd m7, [constsq+120] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + movq m4, [inq+128] + movq m5, [inq+136] + movq m6, [inq+144] + movq m7, [inq+152] + pmaddwd m4, [constsq+128] + pmaddwd m5, [constsq+136] + pmaddwd m6, [constsq+144] + pmaddwd m7, [constsq+152] + paddd m0, m4 + paddd m1, m5 + paddd m2, m6 + paddd m3, m7 + + psrad m0, 16 ; SBC_PROTO_FIXED_SCALE + psrad m1, 16 ; SBC_PROTO_FIXED_SCALE + psrad m2, 16 ; SBC_PROTO_FIXED_SCALE + psrad m3, 16 ; SBC_PROTO_FIXED_SCALE + + packssdw m0, m0 + packssdw m1, m1 + packssdw m2, m2 + packssdw m3, m3 + + movq m4, m0 + movq m5, m0 + pmaddwd m4, [constsq+160] + pmaddwd m5, [constsq+168] + + movq m6, m1 + movq m7, m1 + pmaddwd m6, [constsq+192] + pmaddwd m7, [constsq+200] + paddd m4, m6 + paddd m5, m7 + + movq m6, m2 + movq m7, m2 + pmaddwd m6, [constsq+224] + pmaddwd m7, [constsq+232] + paddd m4, m6 + paddd m5, m7 + + movq m6, m3 + movq m7, m3 + pmaddwd m6, [constsq+256] + pmaddwd m7, [constsq+264] + paddd m4, m6 + paddd m5, m7 + + movq [outq ], m4 + movq [outq+8], m5 
+ + movq m5, m0 + pmaddwd m0, [constsq+176] + pmaddwd m5, [constsq+184] + + movq m7, m1 + pmaddwd m1, [constsq+208] + pmaddwd m7, [constsq+216] + paddd m0, m1 + paddd m5, m7 + + movq m7, m2 + pmaddwd m2, [constsq+240] + pmaddwd m7, [constsq+248] + paddd m0, m2 + paddd m5, m7 + + movq m7, m3 + pmaddwd m3, [constsq+272] + pmaddwd m7, [constsq+280] + paddd m0, m3 + paddd m5, m7 + + movq [outq+16], m0 + movq [outq+24], m5 + + RET + + +;******************************************************************* +;void ff_sbc_calc_scalefactors(int32_t sb_sample_f[16][2][8], +; uint32_t scale_factor[2][8], +; int blocks, int channels, int subbands) +;******************************************************************* +INIT_MMX mmx +cglobal sbc_calc_scalefactors, 5, 7, 4, sb_sample_f, scale_factor, blocks, channels, subbands, ptr, blk + ; subbands = 4 * subbands * channels + movq m3, [scale_mask] + shl subbandsd, 2 + cmp channelsd, 2 + jl .loop_1 + shl subbandsd, 1 + +.loop_1: + sub subbandsq, 8 + lea ptrq, [sb_sample_fq + subbandsq] + + ; blk = (blocks - 1) * 64; + lea blkq, [blocksq - 1] + shl blkd, 6 + + movq m0, m3 +.loop_2: + movq m1, [ptrq+blkq] + pxor m2, m2 + pcmpgtd m1, m2 + paddd m1, [ptrq+blkq] + pcmpgtd m2, m1 + pxor m1, m2 + + por m0, m1 + + sub blkq, 64 + jns .loop_2 + + movd blkd, m0 + psrlq m0, 32 + bsr blkd, blkd + sub blkd, 15 ; SCALE_OUT_BITS + mov [scale_factorq + subbandsq], blkd + + movd blkd, m0 + bsr blkd, blkd + sub blkd, 15 ; SCALE_OUT_BITS + mov [scale_factorq + subbandsq + 4], blkd + + cmp subbandsq, 0 + jg .loop_1 + + emms + RET diff --git a/libavcodec/x86/sbcdsp_init.c b/libavcodec/x86/sbcdsp_init.c new file mode 100644 index 0000000000..86effecfdf --- /dev/null +++ b/libavcodec/x86/sbcdsp_init.c @@ -0,0 +1,51 @@ +/* + * Bluetooth low-complexity, subband codec (SBC) + * + * Copyright (C) 2017 Aurelien Jacobs <au...@gnuage.org> + * Copyright (C) 2008-2010 Nokia Corporation + * Copyright (C) 2004-2010 Marcel Holtmann <mar...@holtmann.org> + * Copyright 
(C) 2004-2005 Henryk Ploetz <hen...@ploetzli.ch> + * Copyright (C) 2005-2006 Brad Midgley <bmidg...@xmission.com> + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** + * @file + * SBC MMX optimization for some basic "building bricks" + */ + +#include "libavutil/cpu.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/sbcdsp.h" + +void ff_sbc_analyze_4_mmx(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_analyze_8_mmx(const int16_t *in, int32_t *out, const int16_t *consts); +void ff_sbc_calc_scalefactors_mmx(int32_t sb_sample_f[16][2][8], + uint32_t scale_factor[2][8], + int blocks, int channels, int subbands); + +av_cold void ff_sbcdsp_init_x86(SBCDSPContext *s) +{ + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_MMX(cpu_flags)) { + s->sbc_analyze_4 = ff_sbc_analyze_4_mmx; + s->sbc_analyze_8 = ff_sbc_analyze_8_mmx; + s->sbc_calc_scalefactors = ff_sbc_calc_scalefactors_mmx; + } +} -- 2.15.1
_______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel