Speed of ffmpeg when decoding a 720p yuv422p10 file encoded with the relevant transform. C: 119fps SSE2: 204fps AVX: 206fps AVX2: 221fps
timer measurements, haar horizontal compose: sse2: 3.68x faster (45143 vs. 12279 decicycles) compared with C avx: 3.68x faster (45143 vs. 12275 decicycles) compared with C avx2: 5.16x faster (45143 vs. 8742 decicycles) compared with C haar vertical compose: sse2: 1.64x faster (31792 vs. 19377 decicycles) compared with C avx: 1.58x faster (31792 vs. 20090 decicycles) compared with C avx2: 1.66x faster (31792 vs. 19157 decicycles) compared with C --- libavcodec/dirac_dwt.c | 7 +- libavcodec/dirac_dwt.h | 1 + libavcodec/x86/Makefile | 6 +- libavcodec/x86/dirac_dwt_10bit.asm | 160 ++++++++++++++++++++++++++ libavcodec/x86/dirac_dwt_init_10bit.c | 76 ++++++++++++ 5 files changed, 247 insertions(+), 3 deletions(-) create mode 100644 libavcodec/x86/dirac_dwt_10bit.asm create mode 100644 libavcodec/x86/dirac_dwt_init_10bit.c diff --git a/libavcodec/dirac_dwt.c b/libavcodec/dirac_dwt.c index cc08f8865a..86bee5bb9b 100644 --- a/libavcodec/dirac_dwt.c +++ b/libavcodec/dirac_dwt.c @@ -59,8 +59,13 @@ int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, return AVERROR_INVALIDDATA; } - if (ARCH_X86 && bit_depth == 8) +#if ARCH_X86 + if (bit_depth == 8) ff_spatial_idwt_init_x86(d, type); + else if (bit_depth == 10) + ff_spatial_idwt_init_10bit_x86(d, type); +#endif + return 0; } diff --git a/libavcodec/dirac_dwt.h b/libavcodec/dirac_dwt.h index 994dc21d70..1ad7b9a821 100644 --- a/libavcodec/dirac_dwt.h +++ b/libavcodec/dirac_dwt.h @@ -88,6 +88,7 @@ enum dwt_type { int ff_spatial_idwt_init(DWTContext *d, DWTPlane *p, enum dwt_type type, int decomposition_count, int bit_depth); void ff_spatial_idwt_init_x86(DWTContext *d, enum dwt_type type); +void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type); void ff_spatial_idwt_slice2(DWTContext *d, int y); diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 2350c8bbee..590d83c167 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -7,7 +7,8 @@ OBJS-$(CONFIG_BLOCKDSP) += x86/blockdsp_init.o OBJS-$(CONFIG_BSWAPDSP) += x86/bswapdsp_init.o OBJS-$(CONFIG_DCT) += x86/dct_init.o OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_init.o \ - x86/dirac_dwt_init.o + x86/dirac_dwt_init.o \ + x86/dirac_dwt_init_10bit.o OBJS-$(CONFIG_FDCTDSP) += x86/fdctdsp_init.o OBJS-$(CONFIG_FFT) += x86/fft_init.o OBJS-$(CONFIG_FLACDSP) += x86/flacdsp_init.o @@ -153,7 +154,8 @@ X86ASM-OBJS-$(CONFIG_APNG_DECODER) += x86/pngdsp.o X86ASM-OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsidct.o X86ASM-OBJS-$(CONFIG_DCA_DECODER) += x86/dcadsp.o x86/synth_filter.o X86ASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp.o \ - x86/dirac_dwt.o + x86/dirac_dwt.o \ + x86/dirac_dwt_10bit.o X86ASM-OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o X86ASM-OBJS-$(CONFIG_EXR_DECODER) += x86/exrdsp.o X86ASM-OBJS-$(CONFIG_FLAC_DECODER) += x86/flacdsp.o diff --git a/libavcodec/x86/dirac_dwt_10bit.asm b/libavcodec/x86/dirac_dwt_10bit.asm new file mode 100644 index 0000000000..baea91329e --- /dev/null +++ b/libavcodec/x86/dirac_dwt_10bit.asm @@ -0,0 +1,160 @@ +;****************************************************************************** +;* x86 optimized discrete 10-bit wavelet trasnform +;* Copyright (c) 2018 James Darnley +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* 51, Inc., Foundation Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "libavutil/x86/x86util.asm" + +SECTION_RODATA + +cextern pd_1 + +SECTION .text + +%macro HAAR_VERTICAL 0 + +cglobal vertical_compose_haar_10bit, 3, 6, 4, b0, b1, w + DECLARE_REG_TMP 4,5 + + mova m2, [pd_1] + mov r3d, wd + and wd, ~(mmsize/4 - 1) + shl wd, 2 + add b0q, wq + add b1q, wq + neg wq + + ALIGN 16 + .loop_simd: + mova m0, [b0q + wq] + mova m1, [b1q + wq] + paddd m3, m1, m2 + psrad m3, 1 + psubd m0, m3 + paddd m1, m0 + mova [b0q + wq], m0 + mova [b1q + wq], m1 + add wq, mmsize + jl .loop_simd + + and r3d, mmsize/4 - 1 + jz .end + .loop_scalar: + mov t0d, [b0q] + mov t1d, [b1q] + mov r2d, t1d + add r2d, 1 + sar r2d, 1 + sub t0d, r2d + add t1d, t0d + mov [b0q], t0d + mov [b1q], t1d + + add b0q, 4 + add b1q, 4 + sub r3d, 1 + jg .loop_scalar + + .end: +RET + +%endmacro + +%macro HAAR_HORIZONTAL 0 + +cglobal horizontal_compose_haar_10bit, 3, 6+ARCH_X86_64, 4, b, temp_, w, x, b2 + DECLARE_REG_TMP 2,5 + %if ARCH_X86_64 + %define tail r6d + %else + %define tail dword wm + %endif + + mova m2, [pd_1] + xor xd, xd + shr wd, 1 + mov tail, wd + lea b2q, [bq + 4*wq] + + ALIGN 16 + .loop_lo: + mova m0, [bq + 4*xq] + movu m1, [b2q + 4*xq] + paddd m1, m2 + psrad m1, 1 + psubd m0, m1 + mova [temp_q + 4*xq], m0 + add xd, mmsize/4 + cmp xd, wd + jl .loop_lo + + xor xd, xd + and wd, ~(mmsize/4 - 1) + + ALIGN 16 + .loop_hi: + mova m0, [temp_q + 4*xq] + movu m1, [b2q + 4*xq] + paddd m1, m0 + paddd m0, m2 + paddd m1, m2 + psrad m0, 1 + psrad m1, 1 + SBUTTERFLY dq, 0,1,3 + %if cpuflag(avx2) + SBUTTERFLY dqqq, 0,1,3 + %endif + mova [bq + 8*xq], m0 + mova [bq + 8*xq + mmsize], m1 + add xd, mmsize/4 + cmp xd, wd + jl .loop_hi + + and tail, mmsize/4 - 1 + jz .end + .loop_scalar: + mov t0d, [temp_q + 4*xq] + mov t1d, [b2q + 4*xq] + add t1d, t0d + add t0d, 1 + add t1d, 1 + sar t0d, 1 + sar t1d, 1 + mov [bq + 8*xq], t0d + mov [bq + 8*xq + 4], t1d + add xq, 1 + sub tail, 1 + jg .loop_scalar + + .end: +REP_RET + +%endmacro + +INIT_XMM sse2 +HAAR_HORIZONTAL +HAAR_VERTICAL + +INIT_XMM avx +HAAR_HORIZONTAL +HAAR_VERTICAL + +INIT_YMM avx2 +HAAR_HORIZONTAL +HAAR_VERTICAL diff --git a/libavcodec/x86/dirac_dwt_init_10bit.c b/libavcodec/x86/dirac_dwt_init_10bit.c new file mode 100644 index 0000000000..289862d728 --- /dev/null +++ b/libavcodec/x86/dirac_dwt_init_10bit.c @@ -0,0 +1,76 @@ +/* + * x86 optimized discrete wavelet transform + * Copyright (c) 2018 James Darnley + * + * This file is part of FFmpeg. + * + * FFmpeg is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * FFmpeg is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with FFmpeg; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/x86/asm.h" +#include "libavutil/x86/cpu.h" +#include "libavcodec/dirac_dwt.h" + +void ff_horizontal_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); +void ff_horizontal_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); +void ff_horizontal_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); + +void ff_vertical_compose_haar_10bit_sse2(int32_t *b0, int32_t *b1, int width_align); +void ff_vertical_compose_haar_10bit_avx(int32_t *b0, int32_t *b1, int width_align); +void ff_vertical_compose_haar_10bit_avx2(int32_t *b0, int32_t *b1, int width_align); + +av_cold void ff_spatial_idwt_init_10bit_x86(DWTContext *d, enum dwt_type type) +{ +#if HAVE_X86ASM + int cpu_flags = av_get_cpu_flags(); + + if (EXTERNAL_SSE2(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_sse2; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_sse2; + break; + } + } + + if (EXTERNAL_AVX(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx; + break; + } + } + + if (EXTERNAL_AVX2(cpu_flags)) { + switch (type) { + case DWT_DIRAC_HAAR0: + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2; + break; + case DWT_DIRAC_HAAR1: + d->horizontal_compose = (void*)ff_horizontal_compose_haar_10bit_avx2; + d->vertical_compose = (void*)ff_vertical_compose_haar_10bit_avx2; + break; + } + } + +#endif // HAVE_X86ASM +} -- 2.18.0 _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel