On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly <alanke...@google.com> wrote:
> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. > --- > EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as > discussed in the email thread for part 1 of this patch. > > Benchmark results on Skylake and Haswell: > > Skylake Haswell > hscale_8_to_15_width4_ssse3 761.2 760 > hscale_8_to_15_width4_avx2 468.7 957 > hscale_8_to_15_width8_ssse3 1170.7 1032 > hscale_8_to_15_width8_avx2 865.7 1979 > hscale_8_to_15_width12_ssse3 2172.2 2472 > hscale_8_to_15_width12_avx2 1245.7 2901 > hscale_8_to_15_width16_ssse3 2244.2 2400 > hscale_8_to_15_width16_avx2 1647.2 3681 > > libswscale/swscale_internal.h | 2 + > libswscale/utils.c | 37 +++++++++++ > libswscale/x86/Makefile | 1 + > libswscale/x86/scale_avx2.asm | 112 ++++++++++++++++++++++++++++++++++ > libswscale/x86/swscale.c | 19 ++++++ > tests/checkasm/sw_scale.c | 20 ++++-- > 6 files changed, 186 insertions(+), 5 deletions(-) > create mode 100644 libswscale/x86/scale_avx2.asm > > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h > index 673407636a..fba3dabe5b 100644 > --- a/libswscale/swscale_internal.h > +++ b/libswscale/swscale_internal.h > @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, > yuv2planar1_fn yuv2plane1, yuv2planarX_fn > //number of extra lines to process > #define MAX_LINES_AHEAD 4 > > +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 > +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > diff --git a/libswscale/utils.c b/libswscale/utils.c > index 176fc6fd63..0577fd5490 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { > [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, > }; > > +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > +#if ARCH_X86_64 > + int i, j, k, l; > + int cpu_flags = av_get_cpu_flags(); > + if (EXTERNAL_AVX2_FAST(cpu_flags)){ > + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > + if (dstW % 16 == 0){ > + if (filter != NULL){ > + for (i = 0; i < dstW; i += 8){ > + FFSWAP(int, filterPos[i + 2], filterPos[i+4]); > + FFSWAP(int, filterPos[i + 3], filterPos[i+5]); > + } > + if (filterSize > 4){ > + int16_t *tmp2 = av_malloc(dstW * filterSize * 2); > + memcpy(tmp2, filter, dstW * filterSize * 2); > + for (i = 0; i < dstW; i += 16){//pixel > + for (k = 0; k < filterSize / 4; ++k){//fcoeff > + for (j = 0; j < 16; ++j){//inner pixel > + for (l = 0; l < 4; ++l){//coeff > + int from = i * filterSize + j * > filterSize + k * 4 + l; > + int to = (i) * filterSize + j * 4 > + l + k * 64; > + filter[to] = tmp2[from]; > + } > + } > + } > + } > + av_free(tmp2); > + } > + } > + } > + } > + } > +#endif > +} > + > int sws_isSupportedInput(enum AVPixelFormat pix_fmt) > { > return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? > @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, 0, 0, 0), > get_local_pos(c, 0, 0, 0))) < 0) > goto fail; > + ff_shuffle_filter_coefficients(c, c->hLumFilterPos, > c->hLumFilterSize, c->hLumFilter, dstW); > if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, > &c->hChrFilterSize, c->chrXInc, > c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, > @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, c->chrSrcHSubSample, > c->src_h_chr_pos, 0), > get_local_pos(c, c->chrDstHSubSample, > c->dst_h_chr_pos, 0))) < 0) > goto fail; > + ff_shuffle_filter_coefficients(c, c->hChrFilterPos, > c->hChrFilterSize, c->hChrFilter, c->chrDstW); > } > } // initialize horizontal stuff > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index bfe383364e..68391494be 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o > X86ASM-OBJS += x86/input.o \ > x86/output.o \ > x86/scale.o \ > + x86/scale_avx2.o > \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm > new file mode 100644 > index 0000000000..d90fd2d791 > --- /dev/null > +++ b/libswscale/x86/scale_avx2.asm > @@ -0,0 +1,112 @@ > > +;****************************************************************************** > +;* x86-optimized horizontal line scaling functions > +;* Copyright 2020 Google LLC > +;* Copyright (c) 2011 Ronald S. Bultje <rsbul...@gmail.com> > +;* > +;* This file is part of FFmpeg. > +;* > +;* FFmpeg is free software; you can redistribute it and/or > +;* modify it under the terms of the GNU Lesser General Public > +;* License as published by the Free Software Foundation; either > +;* version 2.1 of the License, or (at your option) any later version. > +;* > +;* FFmpeg is distributed in the hope that it will be useful, > +;* but WITHOUT ANY WARRANTY; without even the implied warranty of > +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > +;* Lesser General Public License for more details. > +;* > +;* You should have received a copy of the GNU Lesser General Public > +;* License along with FFmpeg; if not, write to the Free Software > +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA > 02110-1301 USA > > +;****************************************************************************** > + > +%include "libavutil/x86/x86util.asm" > + > +SECTION_RODATA > + > +swizzle: dd 0, 4, 1, 5, 2, 6, 3, 7 > +four: times 8 dd 4 > + > +SECTION .text > + > > +;----------------------------------------------------------------------------- > +; horizontal line scaling > +; > +; void hscale8to15_<filterSize>_<opt> > +; (SwsContext *c, int16_t *dst, > +; int dstW, const uint8_t *src, > +; const int16_t *filter, > +; const int32_t *filterPos, int filterSize); > +; > +; Scale one horizontal line. Input is 8-bit width Filter is 14 bits. > Output is > +; 15 bits (in int16_t). Each output pixel is generated from $filterSize > input > +; pixels, the position of the first pixel is given in > filterPos[nOutputPixel]. > > +;----------------------------------------------------------------------------- > + > +%macro SCALE_FUNC 1 > +cglobal hscale8to15_%1, 7, 9, 15, pos0, dst, w, srcmem, filter, fltpos, > fltsize, count, inner > + pxor m0, m0 > + movu m15, [swizzle] > + mov countq, $0 > +%ifidn %1, X4 > + movu m14, [four] > + movsxd fltsizeq, fltsized > + shr fltsizeq, 2 > +%endif > +.loop: > + movu m1, [fltposq] > + movu m2, [fltposq+32] > +%ifidn %1, X4 > + pxor m9, m9 > + pxor m10, m10 > + pxor m11, m11 > + pxor m12, m12 > + mov innerq, $0 > +.innerloop: > +%endif > + vpcmpeqd m13, m13 > + vpgatherdd m3,[srcmemq + m1], m13 > + vpcmpeqd m13, m13 > + vpgatherdd m4,[srcmemq + m2], m13 > + vpunpcklbw m5, m3, m0 > + vpunpckhbw m6, m3, m0 > + vpunpcklbw m7, m4, m0 > + vpunpckhbw m8, m4, m0 > + vpmaddwd m5, m5, [filterq] > + vpmaddwd m6, m6, [filterq + 32] > + vpmaddwd m7, m7, [filterq + 64] > + vpmaddwd m8, m8, [filterq + 96] > + add filterq, $80 > +%ifidn %1, X4 > + paddd m9, m5 > + paddd m10, m6 > + paddd m11, m7 > + paddd m12, m8 > + paddd m1, m14 > + paddd m2, m14 > + add innerq, $1 > + cmp innerq, fltsizeq > + jl .innerloop > + vphaddd m5, m9, m10 > + vphaddd m6, m11, m12 > +%else > + vphaddd m5, m5, m6 > + vphaddd m6, m7, m8 > +%endif > + vpsrad m5, 7 > + vpsrad m6, 7 > + vpackssdw m5, m5, m6 > + vpermd m5, m15, m5 > + vmovdqu [dstq + countq * 2], m5 > + add fltposq, $40 > + add countq, $10 > + cmp countq, wq > + jl .loop > +REP_RET > +%endmacro > + > +%if ARCH_X86_64 > +INIT_YMM avx2 > +SCALE_FUNC 4 > +SCALE_FUNC X4 > +%endif > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 0848a31461..164b06d6ba 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -276,6 +276,9 @@ SCALE_FUNCS_SSE(sse2); > SCALE_FUNCS_SSE(ssse3); > SCALE_FUNCS_SSE(sse4); > > +SCALE_FUNC(4, 8, 15, avx2); > +SCALE_FUNC(X4, 8, 15, avx2); > + > #define VSCALEX_FUNC(size, opt) \ > void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int > filterSize, \ > const int16_t **src, uint8_t > *dest, int dstW, \ > @@ -568,6 +571,22 @@ switch(c->dstBpc){ \ > } > > #if ARCH_X86_64 > +#define ASSIGN_AVX2_SCALE_FUNC(hscalefn, filtersize) \ > + switch (filtersize) { \ > + case 4: hscalefn = ff_hscale8to15_4_avx2; break; \ > + default: hscalefn = ff_hscale8to15_X4_avx2; break; \ > + break; \ > + } > + > + if (EXTERNAL_AVX2_FAST(cpu_flags)){ > + if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > + if(c->chrDstW % 16 == 0) > + ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > + if(c->dstW % 16 == 0) > + ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > + } > + } > + > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > switch (c->dstFormat) { > case AV_PIX_FMT_NV12: > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index 40c5eb3aa8..103b1aa5da 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -135,13 +135,13 @@ static void check_yuv2yuvX(void) > } > > #undef SRC_PIXELS > -#define SRC_PIXELS 128 > +#define SRC_PIXELS 512 > > static void check_hscale(void) > { > #define MAX_FILTER_WIDTH 40 > -#define FILTER_SIZES 5 > - static const int filter_sizes[FILTER_SIZES] = { 4, 8, 16, 32, 40 }; > +#define FILTER_SIZES 6 > + static const int filter_sizes[FILTER_SIZES] = { 4, 8, 12, 16, 32, 40 > }; > > #define HSCALE_PAIRS 2 > static const int hscale_pairs[HSCALE_PAIRS][2] = { > @@ -160,6 +160,8 @@ static void check_hscale(void) > // padded > LOCAL_ALIGNED_32(int16_t, filter, [SRC_PIXELS * MAX_FILTER_WIDTH + > MAX_FILTER_WIDTH]); > LOCAL_ALIGNED_32(int32_t, filterPos, [SRC_PIXELS]); > + LOCAL_ALIGNED_32(int16_t, filterAvx2, [SRC_PIXELS * MAX_FILTER_WIDTH > + MAX_FILTER_WIDTH]); > + LOCAL_ALIGNED_32(int32_t, filterPosAvx, [SRC_PIXELS]); > > // The dst parameter here is either int16_t or int32_t but we use > void* to > // just cover both cases. > @@ -167,6 +169,8 @@ static void check_hscale(void) > const uint8_t *src, const int16_t *filter, > const int32_t *filterPos, int filterSize); > > + int cpu_flags = av_get_cpu_flags(); > + > ctx = sws_alloc_context(); > if (sws_init_context(ctx, NULL, NULL) < 0) > fail(); > @@ -180,9 +184,11 @@ static void check_hscale(void) > ctx->srcBpc = hscale_pairs[hpi][0]; > ctx->dstBpc = hscale_pairs[hpi][1]; > ctx->hLumFilterSize = ctx->hChrFilterSize = width; > + ctx->dstW = ctx->chrDstW = SRC_PIXELS; > > for (i = 0; i < SRC_PIXELS; i++) { > filterPos[i] = i; > + filterPosAvx[i] = i; > > // These filter cofficients are chosen to try break two > corner > // cases, namely: > @@ -211,16 +217,20 @@ static void check_hscale(void) > filter[SRC_PIXELS * width + i] = rnd(); > } > ff_sws_init_scale(ctx); > + memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * > MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > + if (cpu_flags & AV_CPU_FLAG_AVX2){ > + ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, > filterAvx2, SRC_PIXELS); > + } > > if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", > ctx->srcBpc, ctx->dstBpc + 1, width)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > memset(dst1, 0, SRC_PIXELS * sizeof(dst1[0])); > > call_ref(NULL, dst0, SRC_PIXELS, src, filter, filterPos, > width); > - call_new(NULL, dst1, SRC_PIXELS, src, filter, filterPos, > width); > + call_new(NULL, dst1, SRC_PIXELS, src, filterAvx2, > filterPosAvx, width); > if (memcmp(dst0, dst1, SRC_PIXELS * sizeof(dst0[0]))) > fail(); > - bench_new(NULL, dst0, SRC_PIXELS, src, filter, filterPos, > width); > + bench_new(NULL, dst0, SRC_PIXELS, src, filter, > filterPosAvx, width); > } > } > } > -- > 2.32.0.402.g57bb445576-goog > > Part 1 of this patch has been abandoned as it is no longer required. Are there any further comments on this patch or can it be merged? Thanks _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".