Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 119fps SSE2: 204fps AVX: 206fps AVX2: 221fps --- libavcodec/dirac_dwt.c | 7 +- libavcodec/dirac_dwt.h | 1 + libavcodec/x86/Makefile | 6 +- libavcodec/x86/dirac_dwt_10bit.asm | 113 +++++++++++++++++++++ libavcodec/x86/dirac_dwt_init_10bit.c | 136 ++++++++++++++++++++++++++ 5 files changed, 260 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c
diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c index cc08f8865a..86bee5bb9b 100644 --- a/libavcodec/dirac_dwt.c +++ b/libavcodec/dirac_dwt.c @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, return AVERROR_INVALIDDATA; } - if (ARCH_X86 && bit_depth == 8) +#if ARCH_X86 + if (bit_depth == 8) ff_spatial_idwt_init_x86(d, type); + else if (bit_depth == 10) + ff_spatial_idwt_init_10bit_x86(d, type); +#endif + return 0; } diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h index 994dc21d70..1ad7b9a821 100644 --- a/libavcodec/dirac_dwt.h +++ b/libavcodec/dirac_dwt.h @@ -88,6 +88,7 @@ enum dwt_type { int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, int decomposition_count, int bit_depth); void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); void ff_spatial_idwt_slice2(DWTContext *d, int y); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2350c8bbee..590d83c167 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ - x86/dirac_dwt_init.o + x86/dirac_dwt_init.o \ + x86/dirac_dwt_init_10bit.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ - x86/dirac_dwt.o + x86/dirac_dwt.o \ + x86/dirac_dwt_10bit.o X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o 
diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm
new file mode 100644
index 0000000000..dc3830615e
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_10bit.asm
@@ -0,0 +1,113 @@
+;******************************************************************************
+;* x86 optimized discrete 10-bit wavelet transform
+;* Copyright (c) 2018 James Darnley
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pd_1
+
+SECTION .text
+
+%macro HAAR_VERTICAL 0
+; In place, for w coefficients: b0[i] -= (b1[i] + 1) >> 1, then b1[i] += b0[i].
+cglobal vertical_compose_haar_10bit, 3, 3, 4, b0, b1, w
+    mova m2, [pd_1]                 ; rounding term added before each >> 1
+    shl wd, 2                       ; coefficient count -> byte count
+    add b0q, wq                     ; point both rows at their end ...
+    add b1q, wq
+    neg wq                          ; ... and index backwards from there
+; The bound is checked after the body, so one vector is processed even if w is 0.
+    ALIGN 16
+    .loop:
+        mova m0, [b0q + wq]
+        mova m1, [b1q + wq]
+        paddd m3, m1, m2
+        psrad m3, 1                 ; m3 = (b1 + 1) >> 1
+        psubd m0, m3                ; m0 = updated b0
+        paddd m1, m0                ; m1 = b1 + updated b0
+        mova [b0q + wq], m0
+        mova [b1q + wq], m1
+        add wq, mmsize
+    jl .loop
+RET
+
+%endmacro
+
+%macro HAAR_HORIZONTAL 0
+; b is read as two halves of w/2 coefficients each; b2 points at the second half.
+cglobal horizontal_compose_haar_10bit, 3, 6, 4, b, temp_, w, x, b2
+    mova m2, [pd_1]                 ; rounding term added before each >> 1
+    xor xd, xd
+    shr wd, 1                       ; w = number of coefficients per half
+    lea b2q, [bq + 4*wq]
+; Pass 1: temp[x] = b[x] - ((b2[x] + 1) >> 1), in whole vectors until x >= w.
+    ALIGN 16
+    .loop_lo:
+        mova m0, [bq + 4*xq]
+        movu m1, [b2q + 4*xq]
+        paddd m1, m2
+        psrad m1, 1
+        psubd m0, m1
+        mova [temp_q + 4*xq], m0
+        add xd, mmsize/4
+        cmp xd, wd
+    jl .loop_lo
+; Pass 2 only covers the complete vectors that fit into w; skip it if there are none.
+    xor xd, xd
+    and wd, ~(mmsize/4 - 1)
+    cmp wd, mmsize/4
+    jl .end
+; Pass 2: b[2x] = (temp[x] + 1) >> 1, b[2x + 1] = (temp[x] + b2[x] + 1) >> 1.
+    ALIGN 16
+    .loop_hi:
+        mova m0, [temp_q + 4*xq]
+        movu m1, [b2q + 4*xq]
+        paddd m1, m0
+        paddd m0, m2
+        paddd m1, m2
+        psrad m0, 1
+        psrad m1, 1
+        SBUTTERFLY dq, 0,1,3        ; interleave even (m0) and odd (m1) outputs
+        %if cpuflag(avx2)
+        SBUTTERFLY dqqq, 0,1,3      ; presumably restores order across the 128-bit lanes
+        %endif
+        mova [bq + 8*xq], m0
+        mova [bq + 8*xq + mmsize], m1
+        add xd, mmsize/4
+        cmp xd, wd
+    jl .loop_hi
+    .end:
+REP_RET
+
+%endmacro
+
+INIT_XMM sse2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_XMM avx
+HAAR_HORIZONTAL
+HAAR_VERTICAL
+
+INIT_YMM avx2
+HAAR_HORIZONTAL
+HAAR_VERTICAL
diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c
new file mode 100644
index 0000000000..939950e3ff
--- /dev/null
+++ b/libavcodec/x86/dirac_dwt_init_10bit.c
@@ -0,0 +1,136 @@
+/*
+ * x86 optimized discrete wavelet transform
+ * Copyright (c) 2018 James Darnley
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavcodec/dirac_dwt.h"
+
+/*
+ * Kernels implemented in dirac_dwt_10bit.asm.  The horizontal kernels take the
+ * full row width; the vertical kernels take a width that has already been
+ * rounded down to a whole, non-zero number of vectors.
+ */
+void ff_horizontal_compose_haar_10bit_sse2(int32_t *b, int32_t *tmp, int width);
+void ff_horizontal_compose_haar_10bit_avx(int32_t *b, int32_t *tmp, int width);
+void ff_horizontal_compose_haar_10bit_avx2(int32_t *b, int32_t *tmp, int width);
+
+void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align);
+void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align);
+
+#if HAVE_X86ASM
+
+/*
+ * Define vertical_compose_haar_##opt(b0, b1, width).
+ *
+ * The assembly kernel handles the leading width_align coefficients, where
+ * width_align is width rounded down to a multiple of lanes (a power of two),
+ * and the scalar loop finishes the remainder.  The kernel's loop body always
+ * runs at least once, so it must be skipped when width_align is 0: otherwise
+ * it would transform one vector that the scalar loop then transforms again.
+ */
+#define HAAR_VERTICAL(opt, lanes) \
+static void vertical_compose_haar_##opt(int32_t *b0, int32_t *b1, int width) \
+{ \
+    int i, width_align = width & ~((lanes) - 1); \
+    \
+    if (width_align > 0) \
+        ff_vertical_compose_haar_10bit_##opt(b0, b1, width_align); \
+    for (i = width_align; i < width; i++) { \
+        b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
+        b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
+    } \
+}
+
+/*
+ * Define horizontal_compose_haar_##opt(b, tmp, width).
+ *
+ * The assembly kernel fills tmp[] for the first half of the row but writes
+ * interleaved output to b[] only for complete vectors of lanes coefficients;
+ * the scalar loop produces the remaining output pairs from tmp[] and the
+ * second half of b[].
+ * NOTE(review): the kernel's first pass works in whole vectors, so it can
+ * touch up to lanes - 1 coefficients past the end of b[] and of the first
+ * width / 2 entries of tmp[]; confirm both buffers are padded accordingly.
+ */
+#define HAAR_HORIZONTAL(opt, lanes) \
+static void horizontal_compose_haar_##opt(int32_t *b, int32_t *tmp, int width) \
+{ \
+    const int w2 = width / 2; \
+    int i = w2 & ~((lanes) - 1); \
+    \
+    ff_horizontal_compose_haar_10bit_##opt(b, tmp, width); \
+    for (; i < w2; i++) { \
+        b[2 * i]     = (tmp[i] + 1) >> 1; \
+        b[2 * i + 1] = (COMPOSE_HAARiH0(b[i + w2], tmp[i]) + 1) >> 1; \
+    } \
+}
+
+/* The sse2 and avx kernels use 4 lanes of int32_t per vector, avx2 uses 8. */
+HAAR_VERTICAL(sse2, 4)
+HAAR_VERTICAL(avx,  4)
+HAAR_VERTICAL(avx2, 8)
+HAAR_HORIZONTAL(sse2, 4)
+HAAR_HORIZONTAL(avx,  4)
+HAAR_HORIZONTAL(avx2, 8)
+
+#endif /* HAVE_X86ASM */
+
+/**
+ * Install the 10-bit x86 Haar compose functions supported by the running CPU.
+ *
+ * Only DWT_DIRAC_HAAR0 and DWT_DIRAC_HAAR1 are accelerated; for any other
+ * type, or in builds without x86 assembly, d is left untouched.  The
+ * horizontal kernel applies a final (x + 1) >> 1 to every output and is
+ * installed for DWT_DIRAC_HAAR1 only.
+ *
+ * @param d    context whose compose function pointers are overridden
+ * @param type wavelet type; selects which functions are installed
+ */
+av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type)
+{
+#if HAVE_X86ASM
+    int cpu_flags = av_get_cpu_flags();
+    int haar1     = type == DWT_DIRAC_HAAR1;
+
+    if (type != DWT_DIRAC_HAAR0 && !haar1)
+        return;
+
+    /* Later checks override earlier ones, so the last supported set wins. */
+    if (EXTERNAL_SSE2(cpu_flags)) {
+        d->vertical_compose = (void*)vertical_compose_haar_sse2;
+        if (haar1)
+            d->horizontal_compose = (void*)horizontal_compose_haar_sse2;
+    }
+
+    if (EXTERNAL_AVX(cpu_flags)) {
+        d->vertical_compose = (void*)vertical_compose_haar_avx;
+        if (haar1)
+            d->horizontal_compose = (void*)horizontal_compose_haar_avx;
+    }
+
+    if (EXTERNAL_AVX2(cpu_flags)) {
+        d->vertical_compose = (void*)vertical_compose_haar_avx2;
+        if (haar1)
+            d->horizontal_compose = (void*)horizontal_compose_haar_avx2;
+    }
+#endif // HAVE_X86ASM
+}
-- 
2.17.1

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel