Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Ping! On Thu, Jan 14, 2021 at 3:47 PM Alan Kelly wrote: > --- > Replaces cpuflag(mmx) with notcpuflag(sse3) for store macro > Tests for multiple sizes in checkasm-sw_scale > checkasm-sw_scale aligns memory on 8 bytes instad of 32 to catch aligned > loads > libswscale/x86/Makefile | 1 + > libswscale/x86/swscale.c | 130 > libswscale/x86/swscale_template.c | 82 -- > libswscale/x86/yuv2yuvX.asm | 136 ++ > tests/checkasm/sw_scale.c | 103 ++ > 5 files changed, 294 insertions(+), 158 deletions(-) > create mode 100644 libswscale/x86/yuv2yuvX.asm > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index 831d5359aa..bfe383364e 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o > \ > x86/scale.o \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > + x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 15c0b22f20..3df193a067 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) > = 0x8080808080808080ULL; > DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= > 0x0001000100010001ULL; > > > +#define YUV2YUVX_FUNC_DECL(opt) \ > +static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const > int16_t **src, \ > + uint8_t *dest, int dstW, \ > + const uint8_t *dither, int offset); \ > + > +YUV2YUVX_FUNC_DECL(mmx) > +YUV2YUVX_FUNC_DECL(mmxext) > +YUV2YUVX_FUNC_DECL(sse3) > +YUV2YUVX_FUNC_DECL(avx2) > + > //MMX versions > #if HAVE_MMX_INLINE > #undef RENAME > @@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int > dstY) > } > > #if HAVE_MMXEXT > -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, > - const int16_t **src, uint8_t *dest, int dstW, > - const uint8_t *dither, int offset) > -{ > -if(((uintptr_t)dest) & 15){ > -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); > -return; > -} > -filterSize--; > -#define 
MAIN_FUNCTION \ > -"pxor %%xmm0, %%xmm0 \n\t" \ > -"punpcklbw %%xmm0, %%xmm3 \n\t" \ > -"movd %4, %%xmm1 \n\t" \ > -"punpcklwd %%xmm1, %%xmm1 \n\t" \ > -"punpckldq %%xmm1, %%xmm1 \n\t" \ > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ > -"psllw $3, %%xmm1 \n\t" \ > -"paddw %%xmm1, %%xmm3 \n\t" \ > -"psraw $4, %%xmm3 \n\t" \ > -"movdqa %%xmm3, %%xmm4 \n\t" \ > -"movdqa %%xmm3, %%xmm7 \n\t" \ > -"movl %3, %%ecx \n\t" \ > -"mov %0, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -".p2align 4 \n\t" /* > FIXME Unroll? */\ > -"1: \n\t"\ > -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* > filterCoeff */\ > -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 > \n\t" /* srcData */\ > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 > \n\t" /* srcData */\ > -"add$16, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -"test %%"FF_REG_S", %%"FF_REG_S" > \n\t"\ > -"pmulhw %%xmm0, %%xmm2 \n\t"\ > -"pmulhw %%xmm0, %%xmm5 \n\t"\ > -"paddw%%xmm2, %%xmm3 \n\t"\ > -"paddw%%xmm5, %%xmm4 \n\t"\ > -" jnz
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Looks like there are no comments, is this OK to be applied? Thanks On Tue, Feb 9, 2021 at 6:25 PM Paul B Mahol wrote: > Will apply if no comments. > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_scale.c
Initialises each item in src and filter arrays to fix valgrind uninitialised value warning. --- tests/checkasm/sw_scale.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 7504f8b45f..a4866723d7 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -86,8 +86,10 @@ static void check_yuv2yuvX(void) uint16_t coeff[8]; } *vFilterData; uint8_t d_val = rnd(); -randomize_buffers(filter_coeff, LARGEST_FILTER); -randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE); +for(i = 0; i < LARGEST_FILTER * LARGEST_INPUT_SIZE; ++i) + src_pixels[i] = rnd(); +for(i = 0; i < LARGEST_FILTER; ++i) + filter_coeff[i] = rnd(); ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] tests/checkasm/sw_scale.c
Checks av_mallocs --- tests/checkasm/sw_scale.c | 4 1 file changed, 4 insertions(+) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index a4866723d7..ef414c0a82 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -103,7 +103,11 @@ static void check_yuv2yuvX(void) for(osi = 0; osi < 64; osi += 16){ for(fsi = 0; fsi < FILTER_SIZES; ++fsi){ src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]); +if(!src) + fail(); vFilterData = av_malloc((filter_sizes[fsi] + 2) * sizeof(union VFilterData)); +if(!vFilterData) + fail(); memset(vFilterData, 0, (filter_sizes[fsi] + 2) * sizeof(union VFilterData)); for(i = 0; i < filter_sizes[fsi]; ++i){ src[i] = &src_pixels[i * LARGEST_INPUT_SIZE]; -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Thanks James for spotting this. I have sent two patches fixing the valgrind error from checkasm and the unchecked av_mallocs. I do not believe that the two remaining valgrind errors come from my patch, although I may be mistaken. Using git bisect, I have identified b94cd55155d8c061f1e1faca9076afe540149c27 as the problematic commit. On Thu, Feb 18, 2021 at 11:23 PM James Almer wrote: > On 2/17/2021 5:24 PM, Paul B Mahol wrote: > > On Tue, Feb 16, 2021 at 6:31 PM Alan Kelly < > > alankelly-at-google@ffmpeg.org> wrote: > > > >> Looks like there are no comments, is this OK to be applied? Thanks > >> > > > > Applied, thanks for pinging. > > Valgrind complains about this change. The checkasm test specifically. > > > http://fate.ffmpeg.org/report.cgi?time=20210218014903&slot=x86_64-archlinux-gcc-valgrind > > I also noticed it has a bunch of unchecked av_mallocs(). > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_scale.c
Initialises each item in src and filter arrays to fix valgrind uninitialised value warning. --- casts pointers to uint8_t* and multiplies the buffer size by sizeof(uint16_t). tests/checkasm/sw_scale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 7504f8b45f..e3bedd57c6 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -86,8 +86,8 @@ static void check_yuv2yuvX(void) uint16_t coeff[8]; } *vFilterData; uint8_t d_val = rnd(); -randomize_buffers(filter_coeff, LARGEST_FILTER); -randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE); +randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof(uint16_t)); +randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * sizeof(uint16_t)); ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0
--- libswscale/x86/swscale.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 1e865914cb..71961a9ae0 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -206,7 +206,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +if(dstW > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ return; \ } @@ -224,7 +225,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ +if(pixelsProcessed > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX
--- tests/checkasm/sw_scale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index a10118704b..3ac0f9082f 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -68,8 +68,8 @@ static void check_yuv2yuvX(void) #define FILTER_SIZES 4 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16}; #define LARGEST_INPUT_SIZE 512 -#define INPUT_SIZES 4 -static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512}; +#define INPUT_SIZES 6 +static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512}; declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, @@ -107,7 +107,7 @@ static void check_yuv2yuvX(void) for(j = 0; j < 4; ++j) vFilterData[i].coeff[j + 4] = filter_coeff[i]; } -if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", filter_sizes[fsi], osi)){ +if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", filter_sizes[fsi], osi, dstW)){ memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0])); memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0])); -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext
--- This is so that tails of size 8 may safely be processed libswscale/x86/yuv2yuvX.asm | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddwm3, m3, m2 paddwm4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddwm6, m6, m2 paddwm1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psrawm3, m3, 3 psrawm4, m4, 3 +%if cpuflag(sse3) psrawm6, m6, 3 psrawm1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 -add offsetq, mmsize * 2 +%endif +add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop -- 2.30.0.617.g56c4b15f3c-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0
--- libswscale/x86/swscale.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index cc9e8b0155..0848a31461 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +if(dstW > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ return; \ } @@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ +if(pixelsProcessed > 0) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ -- 2.31.0.291.g576ba9dcdaf-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX
--- tests/checkasm/sw_scale.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index a10118704b..3ac0f9082f 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -68,8 +68,8 @@ static void check_yuv2yuvX(void) #define FILTER_SIZES 4 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16}; #define LARGEST_INPUT_SIZE 512 -#define INPUT_SIZES 4 -static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512}; +#define INPUT_SIZES 6 +static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512}; declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, @@ -107,7 +107,7 @@ static void check_yuv2yuvX(void) for(j = 0; j < 4; ++j) vFilterData[i].coeff[j + 4] = filter_coeff[i]; } -if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", filter_sizes[fsi], osi)){ +if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", filter_sizes[fsi], osi, dstW)){ memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0])); memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0])); -- 2.31.0.291.g576ba9dcdaf-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext
--- This is so that inputs of size 8 are supported, as was the case with the original implementation. A bug was found with inputs not divisible by 16. libswscale/x86/yuv2yuvX.asm | 14 +- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 521880dabe..b6294cb919 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -37,8 +37,10 @@ SECTION .text cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %if notcpuflag(sse3) %define movr mova +%define unroll 1 %else %define movr movdqu +%define unroll 2 %endif movsxdifnidn dstWq, dstWd movsxdifnidn offsetq, offsetd @@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset .outerloop: mova m4, m7 mova m3, m7 +%if cpuflag(sse3) mova m6, m7 mova m1, m7 +%endif .loop: %if cpuflag(avx2) vpbroadcastq m0, [filterSizeq + 8] @@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] paddwm3, m3, m2 paddwm4, m4, m5 +%if cpuflag(sse3) pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] paddwm6, m6, m2 paddwm1, m1, m5 +%endif add filterSizeq, $10 mov srcq, [filterSizeq] test srcq, srcq jnz .loop psrawm3, m3, 3 psrawm4, m4, 3 +%if cpuflag(sse3) psrawm6, m6, 3 psrawm1, m1, 3 +%endif packuswb m3, m3, m4 +%if cpuflag(sse3) packuswb m6, m6, m1 +%endif mov srcq, [filterq] %if cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif movr [destq + offsetq], m3 +%if cpuflag(sse3) movr [destq + offsetq + mmsize], m6 -add offsetq, mmsize * 2 +%endif +add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq jb .outerloop -- 2.31.0.291.g576ba9dcdaf-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject 
"unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction
Broadwell and later have fast gather instructions. --- This is so that the avx2 version of ff_hscale8to15X which uses gather instructions is only selected on machines where it will actually be faster. libavutil/cpu.c | 6 ++ libavutil/cpu.h | 6 ++ libavutil/cpu_internal.h | 1 + libavutil/x86/cpu.c | 18 ++ 4 files changed, 31 insertions(+) diff --git a/libavutil/cpu.c b/libavutil/cpu.c index 8960415d00..0a723eeb7a 100644 --- a/libavutil/cpu.c +++ b/libavutil/cpu.c @@ -49,6 +49,12 @@ static atomic_int cpu_flags = ATOMIC_VAR_INIT(-1); +int av_cpu_has_fast_gather(void){ +if (ARCH_X86) +return ff_cpu_has_fast_gather(); +return 0; +} + static int get_cpu_flags(void) { if (ARCH_MIPS) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index b555422dae..faf3a221f4 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -72,6 +72,7 @@ #define AV_CPU_FLAG_MMI (1 << 0) #define AV_CPU_FLAG_MSA (1 << 1) +int av_cpu_has_fast_gather(void); /** * Return the flags which specify extensions supported by the CPU. * The returned value is affected by av_force_cpu_flags() if that was used @@ -107,6 +108,11 @@ int av_cpu_count(void); * av_set_cpu_flags_mask(), then this function will behave as if AVX is not * present. */ + +/** + * Returns true if the cpu has fast gather instructions. 
+ * Broadwell and later cpus have fast gather + */ size_t av_cpu_max_align(void); #endif /* AVUTIL_CPU_H */ diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h index 889764320b..92525df0c1 100644 --- a/libavutil/cpu_internal.h +++ b/libavutil/cpu_internal.h @@ -46,6 +46,7 @@ int ff_get_cpu_flags_aarch64(void); int ff_get_cpu_flags_arm(void); int ff_get_cpu_flags_ppc(void); int ff_get_cpu_flags_x86(void); +int ff_cpu_has_fast_gather(void); size_t ff_get_cpu_max_align_mips(void); size_t ff_get_cpu_max_align_aarch64(void); diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..9724e0017b 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -270,3 +270,21 @@ size_t ff_get_cpu_max_align_x86(void) return 8; } + +int ff_cpu_has_fast_gather(void){ +int eax, ebx, ecx; +int max_std_level, std_caps = 0; +int family = 0, model = 0; +cpuid(0, max_std_level, ebx, ecx, std_caps); + +if (max_std_level >= 1) { +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +// Broadwell and later +if(family == 6 && model >= 70){ + return 1; +} +} +return 0; +} -- 2.32.0.272.g935e593368-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. --- libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 +++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 ++ tests/checkasm/sw_scale.c | 21 +-- 6 files changed, 187 insertions(+), 5 deletions(-) create mode 100644 libswscale/x86/scale_avx2.asm diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index a1de95cee0..45ef657cd4 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1056,4 +1056,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index 6bac7b658d..0dc1f7df7f 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -267,6 +267,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST(cpu_flags) && av_cpu_has_fast_gather()){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ +for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; 
+int to = (i) * filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1697,6 +1732,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1706,6 +1742,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm new file mode 100644 index 00..d90fd2d791 --- /dev/null +++ b/libswscale/x86/scale_avx2.asm @@ -0,0 +1,112 @@ +;**
Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction
Hi, Sorry for the late reply, busy on-call week. Thanks for your responses. I have looked at the code for cpuflags and what you suggested makes sense. I just have a question about naming. EXTERNAL_AVX2_FAST is already used in many places - it checks whether the flag AV_CPU_FLAG_AVXSLOW is set, so I can't use it, as that would change its meaning. Could I define a new flag in the same way as AV_CPU_FLAG_CMOV, e.g. AV_CPU_FLAG_FAST_GATHER or similar? Or could you please suggest a better solution? Thanks On Mon, Jun 14, 2021 at 2:17 PM James Almer wrote: > On 6/14/2021 8:53 AM, Ronald S. Bultje wrote: > > Hi Alan, > > > > On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly < > > alankelly-at-google@ffmpeg.org> wrote: > > > >> Broadwell and later have fast gather instructions. > >> --- > >> This is so that the avx2 version of ff_hscale8to15X which uses gather > >> instructions is only selected on machines where it will actually be > >> faster. > >> > > > > We've in the past typically done this with a bit in the cpuflags return > > value. Can this be added there instead of being its own function? > > > > Also, what is the cycle count of ssse3/avx2 implementation for this > > specific function on Haswell? It would be good to note that in the > > respective patch so that we understand why the check was added. > > Between 9 and 12 on Haswell, 5 to 7 on Broadwell, and about 2 to 5 on > Skylake and newer, according to Agner's pdf if I'm reading it right. It's > also slow on AMD before Zen 3. > > And yes, this should if anything be a new cpu flag and not a new function. > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
Broadwell and later and Zen3 and later have fast gather instructions. --- Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell, and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3. libavutil/cpu.h | 2 ++ libavutil/x86/cpu.c | 18 -- libavutil/x86/cpu.h | 1 + 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index b555422dae..f94eb79af1 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -50,6 +50,7 @@ #define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions #define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction #define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used +#define AV_CPU_FLAG_AVX2SLOW 0x200 ///< AVX2 supported but gather is slower. #define AV_CPU_FLAG_FMA30x1 ///< Haswell FMA3 functions #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 @@ -107,6 +108,7 @@ int av_cpu_count(void); * av_set_cpu_flags_mask(), then this function will behave as if AVX is not * present. 
*/ + size_t av_cpu_max_align(void); #endif /* AVUTIL_CPU_H */ diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..56fcde594c 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,20 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){ rval |= AV_CPU_FLAG_AVX2; + +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +// Haswell and earlier has slow gather +if(family == 6 && model < 70) +rval |= AV_CPU_FLAG_AVX2SLOW; +// Zen 2 and earlier +if (!strncmp(vendor.c, "AuthenticAMD", 12) && family < 25) +rval |= AV_CPU_FLAG_AVX2SLOW; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) @@ -194,8 +206,10 @@ int ff_get_cpu_flags_x86(void) functions using XMM registers are always faster on them. AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. 
*/ -if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) +if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)){ rval |= AV_CPU_FLAG_AVXSLOW; +rval |= AV_CPU_FLAG_AVX2SLOW; +} } /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h index 937c697fa0..a42a15a997 100644 --- a/libavutil/x86/cpu.h +++ b/libavutil/x86/cpu.h @@ -78,6 +78,7 @@ #define EXTERNAL_AVX2(flags)CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2) #define EXTERNAL_AVX2_FAST(flags) CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, AVX2, AVX) #define EXTERNAL_AVX2_SLOW(flags) CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, AVX2, AVX) +#define EXTERNAL_AVX2_FAST_GATHER(flags) CPUEXT_SUFFIX_FAST(flags, _EXTERNAL, AVX2) #define EXTERNAL_AESNI(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI) #define EXTERNAL_AVX512(flags) CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512) -- 2.32.0.93.g670b81a890-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. --- libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 +++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 ++ tests/checkasm/sw_scale.c | 21 +-- 6 files changed, 187 insertions(+), 5 deletions(-) create mode 100644 libswscale/x86/scale_avx2.asm diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index a1de95cee0..45ef657cd4 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1056,4 +1056,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index 6bac7b658d..07c4d2f741 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -267,6 +267,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST_GATHER(cpu_flags)){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ +for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; +int to = (i) * 
filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1697,6 +1732,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1706,6 +1742,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm new file mode 100644 index 00..d90fd2d791 --- /dev/null +++ b/libswscale/x86/scale_avx2.asm @@ -0,0 +1,112 @@ +;***
Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
On Fri, Jun 25, 2021 at 10:40 AM Lynne wrote: > Jun 25, 2021, 09:54 by alankelly-at-google@ffmpeg.org: > > > Broadwell and later and Zen3 and later have fast gather instructions. > > --- > > Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell, > > and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3. > > libavutil/cpu.h | 2 ++ > > libavutil/x86/cpu.c | 18 -- > > libavutil/x86/cpu.h | 1 + > > 3 files changed, 19 insertions(+), 2 deletions(-) > > > > No, we really don't need more FAST/SLOW flags, especially for > something like this which is just fixable by _not_using_vgather_. > Take a look at libavutil/x86/tx_float.asm, we only use vgather > if it's guaranteed to either be faster for what we're gathering or > is just as fast "slow". If neither is true, we use manual lookups, > which is actually advantageous since for AVX2 we can interleave > the lookups that happen in each lane. > > Even if we disregard this, I've extensively benchmarked vgather > on Zen 3, Zen 2, Cascade Lake and Skylake, and there's hardly > a great vgather improvement to be found in Zen 3 to justify > using a new CPU flag for this. > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > Thanks for your response. I'm not against finding a cleaner way of enabling/disabling the code which will be protected by this flag. However, the manual lookups solution proposed will not work in this case, the avx2 version of hscale will only be faster if fast gathers are available, otherwise, the ssse3 version should be used. I haven't got access to a Zen3 so I can't comment on the performance. I have tested on a Zen 2 and it is slow. On Broadwell hscale avx2 is about 10% faster than the ssse3 version and on Skylake about 40% faster, Haswell has similar performance to Zen2. 
Is there a proxy which could be used for detecting Broadwell or Skylake and later? AVX512 seems too strict as there are Skylake chips without AVX512. Thanks ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
On Fri, Jun 25, 2021 at 1:26 PM Ronald S. Bultje wrote: > Hi Alan, > > On Fri, Jun 25, 2021 at 3:59 AM Alan Kelly < > alankelly-at-google@ffmpeg.org> wrote: > >> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. >> > > Re-asking a question I asked before in the other thread: > > Also, what is the cycle count of ssse3/avx2 implementation for this > specific function on Haswell? It would be good to note that in the > respective patch so that we understand why the check was added. > > You should be able to find this in the checkasm --bench --test=X numbers > for this relevant function. > > Ronald > Hi Ronald, Skylake Haswell hscale_8_to_15_width4_ssse3 761.2 760 hscale_8_to_15_width4_avx2 468.7 957 hscale_8_to_15_width8_ssse3 1170.7 1032 hscale_8_to_15_width8_avx2 865.7 1979 hscale_8_to_15_width12_ssse3 2172.2 2472 hscale_8_to_15_width12_avx2 1245.7 2901 hscale_8_to_15_width16_ssse3 2244.2 2400 hscale_8_to_15_width16_avx2 1647.2 3681 As you can see, it is catastrophic on Haswell. In the next iteration of the patch, I will update the description with these numbers. Thanks ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
On Fri, Jun 25, 2021 at 1:24 PM Alan Kelly wrote: > On Fri, Jun 25, 2021 at 10:40 AM Lynne wrote: > >> Jun 25, 2021, 09:54 by alankelly-at-google@ffmpeg.org: >> >> > Broadwell and later and Zen3 and later have fast gather instructions. >> > --- >> > Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on >> Broadwell, >> > and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3. >> > libavutil/cpu.h | 2 ++ >> > libavutil/x86/cpu.c | 18 -- >> > libavutil/x86/cpu.h | 1 + >> > 3 files changed, 19 insertions(+), 2 deletions(-) >> > >> >> No, we really don't need more FAST/SLOW flags, especially for >> something like this which is just fixable by _not_using_vgather_. >> Take a look at libavutil/x86/tx_float.asm, we only use vgather >> if it's guaranteed to either be faster for what we're gathering or >> is just as fast "slow". If neither is true, we use manual lookups, >> which is actually advantageous since for AVX2 we can interleave >> the lookups that happen in each lane. >> >> Even if we disregard this, I've extensively benchmarked vgather >> on Zen 3, Zen 2, Cascade Lake and Skylake, and there's hardly >> a great vgather improvement to be found in Zen 3 to justify >> using a new CPU flag for this. >> ___ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". >> > > Thanks for your response. I'm not against finding a cleaner way of > enabling/disabling the code which will be protected by this flag. However, > the manual lookups solution proposed will not work in this case, the avx2 > version of hscale will only be faster if fast gathers are available, > otherwise, the ssse3 version should be used. > > I haven't got access to a Zen3 so I can't comment on the performance. I > have tested on a Zen 2 and it is slow. 
On Broadwell hscale avx2 is about > 10% faster than the ssse3 version and on Skylake about 40% faster, Haswell > has similar performance to Zen2. > > Is there a proxy which could be used for detecting Broadwell or Skylake > and later? AVX512 seems too strict as there are Skylake chips without > AVX512. Thanks > Hi, I will paste the performance figures from the thread for the other part of this patch here so that the justification for this flag is clearer: Skylake Haswell hscale_8_to_15_width4_ssse3 761.2 760 hscale_8_to_15_width4_avx2 468.7 957 hscale_8_to_15_width8_ssse3 1170.7 1032 hscale_8_to_15_width8_avx2 865.7 1979 hscale_8_to_15_width12_ssse3 2172.2 2472 hscale_8_to_15_width12_avx2 1245.7 2901 hscale_8_to_15_width16_ssse3 2244.2 2400 hscale_8_to_15_width16_avx2 1647.2 3681 As you can see, it is catastrophic on Haswell and older chips but the gains on Skylake are impressive. As I don't have performance figures for Zen 3, I can disable this feature on all cpus apart from Broadwell and later as you say that there is no worthwhile improvement on Zen3. Is this OK with you? Thanks ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
Broadwell and later and Zen3 and later have fast gather instructions. --- Haswell is now excluded from EXTERNAL_AVX2_FAST as discussed in the email thread. libavutil/cpu.h | 1 + libavutil/x86/cpu.c | 11 ++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index c069076439..ec3073d021 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -113,6 +113,7 @@ void av_force_cpu_count(int count); * av_set_cpu_flags_mask(), then this function will behave as if AVX is not * present. */ + size_t av_cpu_max_align(void); #endif /* AVUTIL_CPU_H */ diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..158e2170c4 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,17 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){ rval |= AV_CPU_FLAG_AVX2; + +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +// Haswell and earlier has slow gather +if(family == 6 && model < 70) +rval |= AV_CPU_FLAG_AVXSLOW; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) -- 2.32.0.402.g57bb445576-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. --- EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as discussed in the email thread for part 1 of this patch. Benchmark results on Skylake and Haswell: Skylake Haswell hscale_8_to_15_width4_ssse3 761.2 760 hscale_8_to_15_width4_avx2 468.7 957 hscale_8_to_15_width8_ssse3 1170.7 1032 hscale_8_to_15_width8_avx2 865.7 1979 hscale_8_to_15_width12_ssse3 2172.2 2472 hscale_8_to_15_width12_avx2 1245.7 2901 hscale_8_to_15_width16_ssse3 2244.2 2400 hscale_8_to_15_width16_avx2 1647.2 3681 libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 +++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 ++ tests/checkasm/sw_scale.c | 20 -- 6 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 libswscale/x86/scale_avx2.asm diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 673407636a..fba3dabe5b 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index 176fc6fd63..0577fd5490 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ 
+for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; +int to = (i) * filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o
Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.
On Fri, Jul 16, 2021 at 4:02 PM James Almer wrote: > On 7/16/2021 10:44 AM, Alan Kelly wrote: > > Broadwell and later and Zen3 and later have fast gather instructions. > > --- > > Haswell is now excluded from EXTERNAL_AVX2_FAST as discussed in the > > email thread. > > I was very explicit about this not being ok. We're not disabling all ymm > usage for Haswell just for one or two swscale functions using gathers. > > Lets go with Lynne's latest suggestion and not change the flags at all > and use gathers on Haswell, same as other arches, by looking at the > AVX2_FAST flag. > > > libavutil/cpu.h | 1 + > > libavutil/x86/cpu.c | 11 ++- > > 2 files changed, 11 insertions(+), 1 deletion(-) > > > > diff --git a/libavutil/cpu.h b/libavutil/cpu.h > > index c069076439..ec3073d021 100644 > > --- a/libavutil/cpu.h > > +++ b/libavutil/cpu.h > > @@ -113,6 +113,7 @@ void av_force_cpu_count(int count); > >* av_set_cpu_flags_mask(), then this function will behave as if AVX > is not > >* present. > >*/ > > + > > size_t av_cpu_max_align(void); > > > > #endif /* AVUTIL_CPU_H */ > > diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c > > index bcd41a50a2..158e2170c4 100644 > > --- a/libavutil/x86/cpu.c > > +++ b/libavutil/x86/cpu.c > > @@ -146,8 +146,17 @@ int ff_get_cpu_flags_x86(void) > > if (max_std_level >= 7) { > > cpuid(7, eax, ebx, ecx, edx); > > #if HAVE_AVX2 > > -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) > > +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){ > > rval |= AV_CPU_FLAG_AVX2; > > + > > +cpuid(1, eax, ebx, ecx, std_caps); > > +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); > > +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); > > +// Haswell and earlier has slow gather > > +if(family == 6 && model < 70) > > +rval |= AV_CPU_FLAG_AVXSLOW; > > +} > > + > > #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ > > if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ > > if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == > 0xd003) > > > > ___ > ffmpeg-devel mailing 
list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > OK, apologies for the misunderstanding. In that case part 1 of this patch is not required. Part two remains valid with the function protected by EXTERNAL_AVX2_FAST. Should part 2 be re-submitted as a standalone patch or is it OK as is? ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly wrote: > These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. > --- > EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as > discussed in the email thread for part 1 of this patch. > > Benchmark results on Skylake and Haswell: > > Skylake Haswell > hscale_8_to_15_width4_ssse3 761.2 760 > hscale_8_to_15_width4_avx2 468.7 957 > hscale_8_to_15_width8_ssse3 1170.7 1032 > hscale_8_to_15_width8_avx2 865.7 1979 > hscale_8_to_15_width12_ssse3 2172.2 2472 > hscale_8_to_15_width12_avx2 1245.7 2901 > hscale_8_to_15_width16_ssse3 2244.2 2400 > hscale_8_to_15_width16_avx2 1647.2 3681 > > libswscale/swscale_internal.h | 2 + > libswscale/utils.c| 37 +++ > libswscale/x86/Makefile | 1 + > libswscale/x86/scale_avx2.asm | 112 ++ > libswscale/x86/swscale.c | 19 ++ > tests/checkasm/sw_scale.c | 20 -- > 6 files changed, 186 insertions(+), 5 deletions(-) > create mode 100644 libswscale/x86/scale_avx2.asm > > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h > index 673407636a..fba3dabe5b 100644 > --- a/libswscale/swscale_internal.h > +++ b/libswscale/swscale_internal.h > @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, > yuv2planar1_fn yuv2plane1, yuv2planarX_fn > //number of extra lines to process > #define MAX_LINES_AHEAD 4 > > +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 > +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > diff --git a/libswscale/utils.c b/libswscale/utils.c > index 176fc6fd63..0577fd5490 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { > [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, > }; > > +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > +#if ARCH_X86_64 > +int i, j, k, l; > +int 
cpu_flags = av_get_cpu_flags(); > +if (EXTERNAL_AVX2_FAST(cpu_flags)){ > +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > +if (dstW % 16 == 0){ > +if (filter != NULL){ > +for (i = 0; i < dstW; i += 8){ > +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); > +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); > +} > +if (filterSize > 4){ > +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); > +memcpy(tmp2, filter, dstW * filterSize * 2); > +for (i = 0; i < dstW; i += 16){//pixel > +for (k = 0; k < filterSize / 4; ++k){//fcoeff > +for (j = 0; j < 16; ++j){//inner pixel > +for (l = 0; l < 4; ++l){//coeff > +int from = i * filterSize + j * > filterSize + k * 4 + l; > +int to = (i) * filterSize + j * 4 > + l + k * 64; > +filter[to] = tmp2[from]; > +} > +} > +} > +} > +av_free(tmp2); > +} > +} > +} > +} > +} > +#endif > +} > + > int sws_isSupportedInput(enum AVPixelFormat pix_fmt) > { > return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? > @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, 0, 0, 0), > get_local_pos(c, 0, 0, 0))) < 0) > goto fail; > +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, > c->hLumFilterSize, c->hLumFilter, dstW); > if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, > &c->hChrFilterSize, c->chrXInc, > c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, > @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > get_local_pos(c, c->chrSrcHSubSample, > c->src_h_chr_pos, 0), > get_local_pos(c, c->chrDstHSubSample, >
Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
On Wed, Jul 21, 2021 at 11:11 AM Alan Kelly wrote: > > > On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly wrote: > >> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. >> --- >> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as >> discussed in the email thread for part 1 of this patch. >> >> Benchmark results on Skylake and Haswell: >> >> Skylake Haswell >> hscale_8_to_15_width4_ssse3 761.2 760 >> hscale_8_to_15_width4_avx2 468.7 957 >> hscale_8_to_15_width8_ssse3 1170.7 1032 >> hscale_8_to_15_width8_avx2 865.7 1979 >> hscale_8_to_15_width12_ssse3 2172.2 2472 >> hscale_8_to_15_width12_avx2 1245.7 2901 >> hscale_8_to_15_width16_ssse3 2244.2 2400 >> hscale_8_to_15_width16_avx2 1647.2 3681 >> >> libswscale/swscale_internal.h | 2 + >> libswscale/utils.c| 37 +++ >> libswscale/x86/Makefile | 1 + >> libswscale/x86/scale_avx2.asm | 112 ++ >> libswscale/x86/swscale.c | 19 ++ >> tests/checkasm/sw_scale.c | 20 -- >> 6 files changed, 186 insertions(+), 5 deletions(-) >> create mode 100644 libswscale/x86/scale_avx2.asm >> >> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h >> index 673407636a..fba3dabe5b 100644 >> --- a/libswscale/swscale_internal.h >> +++ b/libswscale/swscale_internal.h >> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, >> yuv2planar1_fn yuv2plane1, yuv2planarX_fn >> //number of extra lines to process >> #define MAX_LINES_AHEAD 4 >> >> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 >> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int >> filterSize, int16_t *filter, int dstW); >> #endif /* SWSCALE_SWSCALE_INTERNAL_H */ >> diff --git a/libswscale/utils.c b/libswscale/utils.c >> index 176fc6fd63..0577fd5490 100644 >> --- a/libswscale/utils.c >> +++ b/libswscale/utils.c >> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = { >> [AV_PIX_FMT_X2RGB10LE] = { 1, 1 }, >> }; >> >> +void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int >> filterSize, int16_t *filter, int dstW){ >> +#if ARCH_X86_64 >> +int i, j, k, l; >> +int cpu_flags = av_get_cpu_flags(); >> +if (EXTERNAL_AVX2_FAST(cpu_flags)){ >> +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ >> +if (dstW % 16 == 0){ >> +if (filter != NULL){ >> +for (i = 0; i < dstW; i += 8){ >> +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); >> +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); >> +} >> +if (filterSize > 4){ >> +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); >> +memcpy(tmp2, filter, dstW * filterSize * 2); >> +for (i = 0; i < dstW; i += 16){//pixel >> +for (k = 0; k < filterSize / 4; ++k){//fcoeff >> +for (j = 0; j < 16; ++j){//inner pixel >> +for (l = 0; l < 4; ++l){//coeff >> +int from = i * filterSize + j * >> filterSize + k * 4 + l; >> +int to = (i) * filterSize + j * >> 4 + l + k * 64; >> +filter[to] = tmp2[from]; >> +} >> +} >> +} >> +} >> +av_free(tmp2); >> +} >> +} >> +} >> +} >> +} >> +#endif >> +} >> + >> int sws_isSupportedInput(enum AVPixelFormat pix_fmt) >> { >> return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? >> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, >> SwsFilter *srcFilter, >> get_local_pos(c, 0, 0, 0), >> get_local_pos(c, 0, 0, 0))) < 0) >> goto fail; >> +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, >> c->hLumFilterSize, c->hLumFilter, dstW); >> if ((ret = initFilter(&c->hChrFilter, &c->hChrF
[FFmpeg-devel] [PATCH] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup
--- libswscale/x86/swscale.c | 138 --- 1 file changed, 72 insertions(+), 66 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..e47fee2bbd 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { -if(((uintptr_t)dest) & 15){ +if(((uintptr_t)dest) & 31){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} +__asm__ volatile( +"vmovq%5, %%xmm3\n\t" +"cmpl $0, %3\n\t" +"jz 2f\n\t" + +"# offset != 0 path.\n\t" +"vpsrlq $24, %%xmm3, %%xmm5\n\t" +"vpsllq $40, %%xmm3, %%xmm3\n\t" +"vpor %%xmm3, %%xmm5, %%xmm3\n\t" + +"2: \n\t" +"vpxor%%xmm0, %%xmm0, %%xmm0\n\t" +"mov(%0), %%"FF_REG_S" \n\t" +"vpunpc
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, although local tests show a significant speed-up
Other functions to be ported to avx2 have been identified and are on the todo list. --- libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 72 +++-- libswscale/x86/yuv2yuvX.asm | 105 3 files changed, 112 insertions(+), 66 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..ea83b097ca 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,6 +197,10 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) @@ -205,72 +209,8 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, however, although local tests show a significant
Fixed. The wrong step size was used, causing a write past the end of the buffer. yuv2yuvX_mmxext is now called if there are any remaining pixels. --- libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 -- libswscale/x86/yuv2yuvX.asm | 105 3 files changed, 116 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..fec9fa22e0 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c -
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, although local tests show a significant spee
, %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); +ff_yuv2yuvX_sse3(filter, filterSize - 1, dest - offset, pixelsProcessed + offset, dither, offset); +if(remainder > 0){ + yuv2yuvX_mmxext(filter, filterSize, src, dest + pixelsProcessed, remainder, dither, offset + pixelsProcessed); } +return; } #endif diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm new file mode 100644 index 00..84727de599 --- /dev/null +++ b/libswscale/x86/yuv2yuvX.asm @@ -0,0 +1,105 @@ +;** +;* x86-optimized yuv2yuvX +;* Copyright 2020 Google LLC +;* +;* This file is part of FFmpeg. +;* +;* FFmpeg is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* FFmpeg is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. 
+;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with FFmpeg; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;** + +%include "libavutil/x86/x86util.asm" + +SECTION .text + +;- +; yuv2yuvX +; +; void ff_yuv2yuvX_(const int16_t *filter, int filterSize, +;uint8_t *dest, int dstW, +;const uint8_t *dither, int offset); +; +;- + +%macro YUV2YUVX_FUNC 0 +cglobal yuv2yuvX, 6, 7, 16, filter, rsi, dest, dstW, dither, offset, src +%if ARCH_X86_64 +movsxd dstWq, dstWd +movsxd offsetq, offsetd +%endif ; x86-64 +movq xmm3, [ditherq] +cmp offsetd, 0 +jz .offset + +; offset != 0 path. +psrlqm5, m3, $18 +psllqm3, m3, $28 +por m3, m3, m5 + +.offset: +%if cpuflag(avx2) +vperm2i128 m3, m3, m3, 0 +%endif ; avx2 +%if ARCH_X86_64 +movq xmm1, rsiq +%else +movd mm1, rsi +%endif +vpbroadcastw m1, xmm1 +pxor m0, m0, m0 +mov rsiq, filterq +mov srcq, [rsiq] +punpcklbwm3, m0 +psllwm1, m1, 3 +paddwm3, m3, m1 +psrawm7, m3, 4 +.outerloop: +mova m4, m7 +mova m3, m7 +mova m6, m7 +mova m1, m7 +.loop: +vpbroadcastq m0, [rsiq + 8] +pmulhw m2, m0, [srcq + offsetq * 2] +pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] +paddwm3, m3, m2 +paddwm4, m4, m5 +pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] +pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] + paddw m6,
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 -- libswscale/x86/yuv2yuvX.asm | 105 3 files changed, 116 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..fec9fa22e0 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g"
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, however, although local tests show a signifi
Thanks for the review, I have made the required changes. As I have changed the subject the patch is in a new thread. On Fri, Oct 23, 2020 at 4:10 PM James Almer wrote: > On 10/23/2020 10:17 AM, Alan Kelly wrote: > > Fixed. The wrong step size was used causing a write passed the end of > > the buffer. yuv2yuvX_mmxext is now called if there are any remaining > pixels. > > Please fix the commit subject (It's too long and contains commentary), > and keep comments about fixes between versions outside of the commit > message body. You can manually place them after the --- below, or in a > separate reply. > > > --- > > libswscale/x86/Makefile | 1 + > > libswscale/x86/swscale.c| 75 -- > > libswscale/x86/yuv2yuvX.asm | 105 > > 3 files changed, 116 insertions(+), 65 deletions(-) > > create mode 100644 libswscale/x86/yuv2yuvX.asm > > > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > > index 831d5359aa..bfe383364e 100644 > > --- a/libswscale/x86/Makefile > > +++ b/libswscale/x86/Makefile > > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o > \ > > x86/scale.o > \ > > x86/rgb_2_rgb.o > \ > > x86/yuv_2_rgb.o > \ > > + x86/yuv2yuvX.o > \ > > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > > index 3160fedf04..fec9fa22e0 100644 > > --- a/libswscale/x86/swscale.c > > +++ b/libswscale/x86/swscale.c > > @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int > dstY) > > } > > > > #if HAVE_MMXEXT > > +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, > > + uint8_t *dest, int dstW, > > + const uint8_t *dither, int offset); > > + > > static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, > > const int16_t **src, uint8_t *dest, int dstW, > > const uint8_t *dither, int offset) > > { > > +int remainder = (dstW % 32); > > +int pixelsProcessed = dstW - remainder; > > if(((uintptr_t)dest) & 15){ > > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); > > return; > > } > > -filterSize--; > > -#define 
MAIN_FUNCTION \ > > -"pxor %%xmm0, %%xmm0 \n\t" \ > > -"punpcklbw %%xmm0, %%xmm3 \n\t" \ > > -"movd %4, %%xmm1 \n\t" \ > > -"punpcklwd %%xmm1, %%xmm1 \n\t" \ > > -"punpckldq %%xmm1, %%xmm1 \n\t" \ > > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ > > -"psllw $3, %%xmm1 \n\t" \ > > -"paddw %%xmm1, %%xmm3 \n\t" \ > > -"psraw $4, %%xmm3 \n\t" \ > > -"movdqa %%xmm3, %%xmm4 \n\t" \ > > -"movdqa %%xmm3, %%xmm7 \n\t" \ > > -"movl %3, %%ecx \n\t" \ > > -"mov %0, %%"FF_REG_d" > \n\t"\ > > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > > -".p2align 4 \n\t" /* > FIXME Unroll? */\ > > -"1: \n\t"\ > > -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* > filterCoeff */\ > > -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 > \n\t" /* srcData */\ > > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 > \n\t" /* srcData */\ > > -"add$16, %%"FF_REG_d" > \n\t"\ > > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > > -"test %%"FF_REG_S", %%"FF_REG_S" > \n\t"\ > > -"pmulhw %%xmm0, %%xmm2 \n\t"\ > > -"pmulhw %%xmm0, %%xmm5 \n\t"\ > > -"paddw%%xmm2, %%xmm3 \n\t"\ > > -"paddw
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Apologies for the multiple threads, my git send-email was wrongly configured. This has been fixed. This code has been tested on AVX2 giving a significant speedup, however, until the ff_hscale* functions are ported to avx2, this should not be enabled as it results in an overall slowdown of swscale probably due to cpu frequency scaling. checkasm will follow in a separate patch. On Tue, Oct 27, 2020 at 9:56 AM Alan Kelly wrote: > --- > libswscale/x86/Makefile | 1 + > libswscale/x86/swscale.c| 75 -- > libswscale/x86/yuv2yuvX.asm | 105 > 3 files changed, 116 insertions(+), 65 deletions(-) > create mode 100644 libswscale/x86/yuv2yuvX.asm > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index 831d5359aa..bfe383364e 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o > \ > x86/scale.o \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > + x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 3160fedf04..fec9fa22e0 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int > dstY) > } > > #if HAVE_MMXEXT > +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, > + uint8_t *dest, int dstW, > + const uint8_t *dither, int offset); > + > static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, > const int16_t **src, uint8_t *dest, int dstW, > const uint8_t *dither, int offset) > { > +int remainder = (dstW % 32); > +int pixelsProcessed = dstW - remainder; > if(((uintptr_t)dest) & 15){ > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); > return; > } > -filterSize--; > -#define MAIN_FUNCTION \ > -"pxor %%xmm0, %%xmm0 \n\t" \ > -"punpcklbw %%xmm0, %%xmm3 \n\t" \ > -"movd %4, %%xmm1 \n\t" \ > -"punpcklwd %%xmm1, %%xmm1 \n\t" \ > -"punpckldq %%xmm1, %%xmm1 \n\t" \ > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ > -"psllw $3, %%xmm1 \n\t" \ > -"paddw %%xmm1, 
%%xmm3 \n\t" \ > -"psraw $4, %%xmm3 \n\t" \ > -"movdqa %%xmm3, %%xmm4 \n\t" \ > -"movdqa %%xmm3, %%xmm7 \n\t" \ > -"movl %3, %%ecx \n\t" \ > -"mov %0, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -".p2align 4 \n\t" /* > FIXME Unroll? */\ > -"1: \n\t"\ > -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* > filterCoeff */\ > -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 > \n\t" /* srcData */\ > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 > \n\t" /* srcData */\ > -"add$16, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -"test %%"FF_REG_S", %%"FF_REG_S" > \n\t"\ > -"pmulhw %%xmm0, %%xmm2 \n\t"\ > -"pmulhw %%xmm0, %%xmm5 \n\t"\ > -"paddw%%xmm2, %%xmm3 \n\t"\ > -"paddw%%xmm5, %%xmm4 \n\t"\ > -" jnz1b \n\t"\ > -"psraw $3, %%xmm3 \n\t"\ > -"psraw $3, %%xmm4 \n\t"\ > -"packuswb %%xmm4, %%xmm3 \n\t"\ > -"movntdq %%xmm3, (%1, %%"FF_REG_c") > \n\t"
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 - libswscale/x86/yuv2yuvX.asm | 109 3 files changed, 120 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..fec9fa22e0 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Thanks for the feedback, Anton. The second patch incorporates the changes suggested by James Almer: avx2 instructions are wrapped in if cpuflag(avx2) and movddup is restored; mm1 is replaced by m1 on x86_32. On Tue, Oct 27, 2020 at 10:40 AM Anton Khirnov wrote: > Hi, > Quoting Alan Kelly (2020-10-27 10:10:14) > > --- > > libswscale/x86/Makefile | 1 + > > libswscale/x86/swscale.c| 75 - > > libswscale/x86/yuv2yuvX.asm | 109 > > 3 files changed, 120 insertions(+), 65 deletions(-) > > create mode 100644 libswscale/x86/yuv2yuvX.asm > > > > No comments on the code itself (yet?), but as for your submission: > - when you send multiple iterations of the same patch, it is helpful to > mention what changed, e.g. with git send-email --annotate > - the commit message should follow the standard format of: > * swscale: short summary of the change > > Extended description of the commit, if needed. > > -- > Anton Khirnov > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
The function was rewritten in asm. This code is heavily derived from the original code: the algorithm remains unchanged, but the implementation is optimized. Would you agree to adding the copyright from swscale.c: * Copyright (C) 2001-2011 Michael Niedermayer to this file, so that it carries both copyrights? Thank you. On Sat, Oct 31, 2020 at 1:02 PM Carl Eugen Hoyos wrote: > Am Di., 27. Okt. 2020 um 09:56 Uhr schrieb Alan Kelly > : > > > --- /dev/null > > +++ b/libswscale/x86/yuv2yuvX.asm > > @@ -0,0 +1,105 @@ > > > +;** > > +;* x86-optimized yuv2yuvX > > +;* Copyright 2020 Google LLC > > Either the commit message ("move a function") or this > copyright statement is wrong, please fix this. > > Please do not commit as-is... > > Carl Eugen > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- yuv2yuvX.asm: Ports yuv2yuvX to asm, unrolls main loop and adds other small optimizations for ~20% speed-up. Copyright updated to include the original from swscale.c. swscale.c: Removes yuv2yuvX_sse3 and calls new function ff_yuv2yuvX_sse3. Calls yuv2yuvX_mmxext on remaining elements if required. Makefile: Compiles yuv2yuvX.asm libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 libswscale/x86/yuv2yuvX.asm | 110 3 files changed, 121 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..fec9fa22e0 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 
-".p2align 4 \n\t" /* FIXME Unroll? */\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- It now works on x86-32 libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 libswscale/x86/yuv2yuvX.asm | 110 3 files changed, 121 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..758c8e540f 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Fixes bug in sse3 path where m1 is not set correctly resulting in off by one errors. The results are now bit by bit identical. libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 libswscale/x86/yuv2yuvX.asm | 114 3 files changed, 125 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..758c8e540f 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} el
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- All of Henrik's suggestions have been implemented. Additionally, m3 and m6 are permuted in avx2 before storing to ensure bit by bit identical results in avx2. libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 75 +++ libswscale/x86/yuv2yuvX.asm | 118 3 files changed, 129 insertions(+), 65 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..758c8e540f 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT +void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize, + uint8_t *dest, int dstW, + const uint8_t *dither, int offset); + static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset) { +int remainder = (dstW % 32); +int pixelsProcessed = dstW - remainder; if(((uintptr_t)dest) & 15){ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); return; } -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_R
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Ping On Thu, Nov 19, 2020 at 9:42 AM Alan Kelly wrote: > --- > All of Henrik's suggestions have been implemented. Additionally, > m3 and m6 are permuted in avx2 before storing to ensure bit by bit > identical results in avx2. > libswscale/x86/Makefile | 1 + > libswscale/x86/swscale.c| 75 +++ > libswscale/x86/yuv2yuvX.asm | 118 > 3 files changed, 129 insertions(+), 65 deletions(-) > create mode 100644 libswscale/x86/yuv2yuvX.asm > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index 831d5359aa..bfe383364e 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o > \ > x86/scale.o \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > + x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 3160fedf04..758c8e540f 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int > dstY) > } > > #if HAVE_MMXEXT > +void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize, > + uint8_t *dest, int dstW, > + const uint8_t *dither, int offset); > + > static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, > const int16_t **src, uint8_t *dest, int dstW, > const uint8_t *dither, int offset) > { > +int remainder = (dstW % 32); > +int pixelsProcessed = dstW - remainder; > if(((uintptr_t)dest) & 15){ > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); > return; > } > -filterSize--; > -#define MAIN_FUNCTION \ > -"pxor %%xmm0, %%xmm0 \n\t" \ > -"punpcklbw %%xmm0, %%xmm3 \n\t" \ > -"movd %4, %%xmm1 \n\t" \ > -"punpcklwd %%xmm1, %%xmm1 \n\t" \ > -"punpckldq %%xmm1, %%xmm1 \n\t" \ > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ > -"psllw $3, %%xmm1 \n\t" \ > -"paddw %%xmm1, %%xmm3 \n\t" \ > -"psraw $4, %%xmm3 \n\t" \ > -"movdqa %%xmm3, %%xmm4 \n\t" \ > -"movdqa %%xmm3, %%xmm7 \n\t" \ > -"movl %3, %%ecx \n\t" \ > -"mov %0, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), 
%%"FF_REG_S" > \n\t"\ > -".p2align 4 \n\t" /* > FIXME Unroll? */\ > -"1: \n\t"\ > -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* > filterCoeff */\ > -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 > \n\t" /* srcData */\ > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 > \n\t" /* srcData */\ > -"add$16, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -"test %%"FF_REG_S", %%"FF_REG_S" > \n\t"\ > -"pmulhw %%xmm0, %%xmm2 \n\t"\ > -"pmulhw %%xmm0, %%xmm5 \n\t"\ > -"paddw%%xmm2, %%xmm3 \n\t"\ > -"paddw%%xmm5, %%xmm4 \n\t"\ > -" jnz1b \n\t"\ > -"psraw $3, %%xmm3 \n\t"\ > -"psraw $3, %%xmm4 \n\t"\ > -"packuswb %%xmm4, %%xmm3 \n\t"\ > -"movntdq %%xmm3, (%1, %%"FF_REG_c") > \n\t"\ > -"add $16, %%"FF_REG_c"\n\t"\ > -"cmp %2, %%"FF_REG_c"\n\t"\ > -"mo
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Activates avx2 version of yuv2yuvX Adds checkasm for yuv2yuvX Modifies ff_yuv2yuvX_* signature to match yuv2yuvX_* Replaces non-temporal stores with temporal stores libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 106 +--- libswscale/x86/yuv2yuvX.asm | 118 tests/checkasm/sw_scale.c | 101 +- 4 files changed, 249 insertions(+), 77 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..8cd8713705 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), -
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
This function is tested by fate-filter-fps-r. I have also added a checkasm test and bench. I have done a lot more testing and benching of this code and I am now happy to activate the avx2 version because the performance is so good. On my machine I get the following results for filter size 4 and 0 offset. For all other sizes/offsets the results are similar: yuv2yuvX_4_0_mmx: 1567.2 1563.1 yuv2yuvX_4_0_mmxext: 1560.7 1560.1 yuv2yuvX_4_0_sse3: 780.7 572.1 -26.7% yuv2yuvX_4_0_avx2: n/a 341.1 -56.3% Interestingly I discovered that the non-temporal store movntdq results in a very large variability in the test results, in many cases it significantly increases the execution time. I have replaced these stores with aligned stores which stabilised the runtimes. However, I am aware that benchmarks often don't represent reality and these non-temporal stores were probably used for a good reason. If you think it better to use NT stores, I will replace them. On Fri, Dec 4, 2020 at 2:00 PM Anton Khirnov wrote: > Quoting Alan Kelly (2020-11-19 09:41:56) > > --- > > All of Henrik's suggestions have been implemented. Additionally, > > m3 and m6 are permuted in avx2 before storing to ensure bit by bit > > identical results in avx2. > > libswscale/x86/Makefile | 1 + > > libswscale/x86/swscale.c| 75 +++ > > libswscale/x86/yuv2yuvX.asm | 118 > > 3 files changed, 129 insertions(+), 65 deletions(-) > > create mode 100644 libswscale/x86/yuv2yuvX.asm > > Is this function tested by FATE? > I did some brief testing and apparently it gets called during > fate-filter-shuffleplanes-dup-luma, but the results do not change even > if I comment out the whole function. > > Also, it seems like you are adding an AVX2 version of the function, but > I don't see it being used. 
> > -- > Anton Khirnov > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Replaces ff_sws_init_swscale_x86 with ff_getSwsFunc Load offset if not gprsize but 8 on both 32 and 64 bit Removes sfence as NT store no longer used libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 106 +--- libswscale/x86/yuv2yuvX.asm | 117 tests/checkasm/sw_scale.c | 101 ++- 4 files changed, 248 insertions(+), 77 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..8cd8713705 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Fixes memory alignment problem in checkasm-sw_scale Tested on Linux 32 and 64 bit and mingw32 libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 106 +--- libswscale/x86/yuv2yuvX.asm | 117 tests/checkasm/sw_scale.c | 98 ++ 4 files changed, 246 insertions(+), 76 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..8cd8713705 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), -
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Ping! On Thu, Dec 17, 2020 at 11:42 AM Alan Kelly wrote: > --- > Fixes memory alignment problem in checkasm-sw_scale > Tested on Linux 32 and 64 bit and mingw32 > libswscale/x86/Makefile | 1 + > libswscale/x86/swscale.c| 106 +--- > libswscale/x86/yuv2yuvX.asm | 117 > tests/checkasm/sw_scale.c | 98 ++ > 4 files changed, 246 insertions(+), 76 deletions(-) > create mode 100644 libswscale/x86/yuv2yuvX.asm > > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile > index 831d5359aa..bfe383364e 100644 > --- a/libswscale/x86/Makefile > +++ b/libswscale/x86/Makefile > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o > \ > x86/scale.o \ > x86/rgb_2_rgb.o \ > x86/yuv_2_rgb.o \ > + x86/yuv2yuvX.o \ > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 3160fedf04..8cd8713705 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int > dstY) > } > > #if HAVE_MMXEXT > -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, > - const int16_t **src, uint8_t *dest, int dstW, > - const uint8_t *dither, int offset) > -{ > -if(((uintptr_t)dest) & 15){ > -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); > -return; > -} > -filterSize--; > -#define MAIN_FUNCTION \ > -"pxor %%xmm0, %%xmm0 \n\t" \ > -"punpcklbw %%xmm0, %%xmm3 \n\t" \ > -"movd %4, %%xmm1 \n\t" \ > -"punpcklwd %%xmm1, %%xmm1 \n\t" \ > -"punpckldq %%xmm1, %%xmm1 \n\t" \ > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ > -"psllw $3, %%xmm1 \n\t" \ > -"paddw %%xmm1, %%xmm3 \n\t" \ > -"psraw $4, %%xmm3 \n\t" \ > -"movdqa %%xmm3, %%xmm4 \n\t" \ > -"movdqa %%xmm3, %%xmm7 \n\t" \ > -"movl %3, %%ecx \n\t" \ > -"mov %0, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -".p2align 4 \n\t" /* > FIXME Unroll? 
*/\ > -"1: \n\t"\ > -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* > filterCoeff */\ > -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 > \n\t" /* srcData */\ > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 > \n\t" /* srcData */\ > -"add$16, %%"FF_REG_d" > \n\t"\ > -"mov(%%"FF_REG_d"), %%"FF_REG_S" > \n\t"\ > -"test %%"FF_REG_S", %%"FF_REG_S" > \n\t"\ > -"pmulhw %%xmm0, %%xmm2 \n\t"\ > -"pmulhw %%xmm0, %%xmm5 \n\t"\ > -"paddw%%xmm2, %%xmm3 \n\t"\ > -"paddw%%xmm5, %%xmm4 \n\t"\ > -" jnz1b \n\t"\ > -"psraw $3, %%xmm3 \n\t"\ > -"psraw $3, %%xmm4 \n\t"\ > -"packuswb %%xmm4, %%xmm3 \n\t"\ > -"movntdq %%xmm3, (%1, %%"FF_REG_c") > \n\t"\ > -"add $16, %%"FF_REG_c"\n\t"\ > -"cmp %2, %%"FF_REG_c"\n\t"\ > -"movdqa %%xmm7, %%xmm3\n\t" \ > -"movdqa %%xmm7, %%xmm4\n\t" \ > -"mov %0, %%"FF_REG_d" > \n\t"\ > -&q
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Replaces mova with movdqu due to alignment issues libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c| 106 +--- libswscale/x86/yuv2yuvX.asm | 117 tests/checkasm/sw_scale.c | 98 ++ 4 files changed, 246 insertions(+), 76 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 3160fedf04..8cd8713705 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? 
*/\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3, %%xmm4 \n\t" -"psrlq$24, %%xmm3 \n\t" -"psllq$40, %%xmm4 \n\t" -"por %%xmm4, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *) dither)[0]) - : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) -"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c - ); -} else { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -MAIN_FUNCTION - :: "g" (filter), - "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), - "m"(filterSize), "m"(((uint64_t *)
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Thanks for your patience with this, I have replaced mova with movdqu - movu generated a compile error on ssse3. What system did this crash on? On Wed, Jan 6, 2021 at 9:10 PM Michael Niedermayer wrote: > On Tue, Jan 05, 2021 at 01:31:25PM +0100, Alan Kelly wrote: > > Ping! > > crashes (due to alignment i think) > > (gdb) disassemble $rip-32,$rip+32 > Dump of assembler code from 0x555730a1 to 0x555730e1: >0x555730a1 : int$0x71 >0x555730a3 : out%al,$0x3 >0x555730a5 : vpsraw $0x3,%ymm1,%ymm1 >0x555730aa : vpackuswb %ymm4,%ymm3,%ymm3 >0x555730ae : vpackuswb %ymm1,%ymm6,%ymm6 >0x555730b2 : mov(%rdi),%rdx >0x555730b5 : vpermq $0xd8,%ymm3,%ymm3 >0x555730bb : vpermq $0xd8,%ymm6,%ymm6 > => 0x555730c1 : vmovdqa %ymm3,(%rcx,%rax,1) >0x555730c6 : vmovdqa > %ymm6,0x20(%rcx,%rax,1) >0x555730cc : add$0x40,%rax >0x555730d0 : mov%rdi,%rsi >0x555730d3 : cmp%r8,%rax >0x555730d6 : jb 0x5557304d > >0x555730dc : vzeroupper >0x555730df : retq >0x555730e0 : push %r15 > End of assembler dump. > (gdb) info all-registers > rax0x0 0 > rbx0x0 0 > rcx0x5583f470 93824995292272 > > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Modern terrorism, a quick summary: Need oil, start war with country that > has oil, kill hundread thousand in war. Let country fall into chaos, > be surprised about raise of fundamantalists. Drop more bombs, kill more > people, be surprised about them taking revenge and drop even more bombs > and strip your own citizens of their rights and freedoms. to be continued > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
It's a bug in the patch. The tail not processed by the sse3/avx2 version is done by the mmx version. I used offset to account for the src pixels already processed, however, dither is modified if offset is not 0. In cases where there is a tail and offset is 0, this bug appears. I am working on a solution. On Sun, Jan 10, 2021 at 4:26 PM Michael Niedermayer wrote: > On Thu, Jan 07, 2021 at 10:41:19AM +0100, Alan Kelly wrote: > > --- > > Replaces mova with movdqu due to alignment issues > > libswscale/x86/Makefile | 1 + > > libswscale/x86/swscale.c| 106 +--- > > libswscale/x86/yuv2yuvX.asm | 117 > > tests/checkasm/sw_scale.c | 98 ++ > > 4 files changed, 246 insertions(+), 76 deletions(-) > > create mode 100644 libswscale/x86/yuv2yuvX.asm > > I have one / some ? cases where this changes output > ./ffmpeg -i utvideo-yuv422p10le_UQY2_crc32-A431CD5F.avi -bitexact avi.avi > > i dont know if theres a decoder bug or bug in the patch or something else > > -rw-r- 1 michael michael 246218 Jan 10 16:23 avi.avi > -rw-r- 1 michael michael 245824 Jan 10 16:23 avi-ref.avi > > file should be at: > https://samples.ffmpeg.org/ffmpeg-bugs/trac/ticket4044/ > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > In a rich man's house there is no place to spit but his face. > -- Diogenes of Sinope > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Fixes a bug where if there is no offset and a tail which is not processed by the sse3/avx2 version the dither is modified Deletes mmx/mmxext yuv2yuvX version from swscale_template and adds it to yuv2yuvX.asm to reduce code duplication and so that it may be used to process the tail from the larger cardinal simd versions. src argument of yuv2yuvX_* is now srcOffset, so that tails and offsets are accounted for correctly. Changes input size in checkasm so that this corner case is tested. libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c | 130 libswscale/x86/swscale_template.c | 82 -- libswscale/x86/yuv2yuvX.asm | 136 ++ tests/checkasm/sw_scale.c | 100 ++ 5 files changed, 291 insertions(+), 158 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 15c0b22f20..3df193a067 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 0x0001000100010001ULL; +#define YUV2YUVX_FUNC_DECL(opt) \ +static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const int16_t **src, \ + uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset); \ + +YUV2YUVX_FUNC_DECL(mmx) +YUV2YUVX_FUNC_DECL(mmxext) +YUV2YUVX_FUNC_DECL(sse3) +YUV2YUVX_FUNC_DECL(avx2) + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) 
& 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq %%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? */\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov
Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
Apologies for this: when I added mmx to the yasm file, I added a macro for the stores selecting mova for mmx and movdqu for the others. However, "if cpuflag(mmx)" evaluates to true for all architectures, so I replaced it with "if notcpuflag(sse3)". The alignment in the checkasm test has been changed to 8 from 32 so that the test catches problems with alignment. On Thu, Jan 14, 2021 at 1:11 AM Michael Niedermayer wrote: > On Mon, Jan 11, 2021 at 05:46:31PM +0100, Alan Kelly wrote: > > --- > > Fixes a bug where if there is no offset and a tail which is not > processed by the > > sse3/avx2 version the dither is modified > > Deletes mmx/mmxext yuv2yuvX version from swscale_template and adds it > > to yuv2yuvX.asm to reduce code duplication and so that it may be used > > to process the tail from the larger cardinal simd versions. > > src argument of yuv2yuvX_* is now srcOffset, so that tails and offsets > > are accounted for correctly. > > Changes input size in checkasm so that this corner case is tested. 
> > > > libswscale/x86/Makefile | 1 + > > libswscale/x86/swscale.c | 130 > > libswscale/x86/swscale_template.c | 82 -- > > libswscale/x86/yuv2yuvX.asm | 136 ++ > > tests/checkasm/sw_scale.c | 100 ++ > > 5 files changed, 291 insertions(+), 158 deletions(-) > > create mode 100644 libswscale/x86/yuv2yuvX.asm > > This seems to be crashing again unless i messed up testing > > (gdb) disassemble $rip-32,$rip+32 > Dump of assembler code from 0x55572f02 to 0x55572f42: >0x55572f02 : int$0x71 >0x55572f04 : out%al,$0x3 >0x55572f06 : vpsraw $0x3,%ymm1,%ymm1 >0x55572f0b : vpackuswb %ymm4,%ymm3,%ymm3 >0x55572f0f : vpackuswb %ymm1,%ymm6,%ymm6 >0x55572f13 : mov(%rdi),%rdx >0x55572f16 : vpermq $0xd8,%ymm3,%ymm3 >0x55572f1c : vpermq $0xd8,%ymm6,%ymm6 > => 0x55572f22 : vmovdqa %ymm3,(%rcx,%rax,1) >0x55572f27 : vmovdqa > %ymm6,0x20(%rcx,%rax,1) >0x55572f2d : add$0x40,%rax >0x55572f31 : mov%rdi,%rsi >0x55572f34 : cmp%r8,%rax >0x55572f37 : jb 0x55572eae > >0x55572f3d : vzeroupper >0x55572f40 : retq >0x55572f41 : nopw %cs:0x0(%rax,%rax,1) > > rax0x0 0 > rbx0x30 48 > rcx0x5583f470 93824995292272 > rdx0x5585e500 93824995419392 > > #0 0x55572f22 in ff_yuv2yuvX_avx2 () > #1 0x555724ee in yuv2yuvX_avx2 () > #2 0x5556b4f6 in chr_planar_vscale () > #3 0x55566d41 in swscale () > #4 0x55568284 in sws_scale () > > > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > What does censorship reveal? It reveals fear. -- Julian Assange > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.
--- Replaces cpuflag(mmx) with notcpuflag(sse3) for store macro Tests for multiple sizes in checkasm-sw_scale checkasm-sw_scale aligns memory on 8 bytes instead of 32 to catch aligned loads libswscale/x86/Makefile | 1 + libswscale/x86/swscale.c | 130 libswscale/x86/swscale_template.c | 82 -- libswscale/x86/yuv2yuvX.asm | 136 ++ tests/checkasm/sw_scale.c | 103 ++ 5 files changed, 294 insertions(+), 158 deletions(-) create mode 100644 libswscale/x86/yuv2yuvX.asm diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index 831d5359aa..bfe383364e 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o \ x86/scale.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ + x86/yuv2yuvX.o \ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 15c0b22f20..3df193a067 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL; DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 0x0001000100010001ULL; +#define YUV2YUVX_FUNC_DECL(opt) \ +static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const int16_t **src, \ + uint8_t *dest, int dstW, \ + const uint8_t *dither, int offset); \ + +YUV2YUVX_FUNC_DECL(mmx) +YUV2YUVX_FUNC_DECL(mmxext) +YUV2YUVX_FUNC_DECL(sse3) +YUV2YUVX_FUNC_DECL(avx2) + //MMX versions #if HAVE_MMX_INLINE #undef RENAME @@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY) } #if HAVE_MMXEXT -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, - const int16_t **src, uint8_t *dest, int dstW, - const uint8_t *dither, int offset) -{ -if(((uintptr_t)dest) & 15){ -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); -return; -} -filterSize--; -#define MAIN_FUNCTION \ -"pxor %%xmm0, %%xmm0 \n\t" \ -"punpcklbw %%xmm0, %%xmm3 \n\t" \ -"movd %4, %%xmm1 \n\t" \ -"punpcklwd %%xmm1, %%xmm1 \n\t" \ -"punpckldq %%xmm1, %%xmm1 \n\t" \ -"punpcklqdq 
%%xmm1, %%xmm1 \n\t" \ -"psllw $3, %%xmm1 \n\t" \ -"paddw %%xmm1, %%xmm3 \n\t" \ -"psraw $4, %%xmm3 \n\t" \ -"movdqa %%xmm3, %%xmm4 \n\t" \ -"movdqa %%xmm3, %%xmm7 \n\t" \ -"movl %3, %%ecx \n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -".p2align 4 \n\t" /* FIXME Unroll? */\ -"1: \n\t"\ -"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ -"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ -"add$16, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ -"pmulhw %%xmm0, %%xmm2 \n\t"\ -"pmulhw %%xmm0, %%xmm5 \n\t"\ -"paddw%%xmm2, %%xmm3 \n\t"\ -"paddw%%xmm5, %%xmm4 \n\t"\ -" jnz1b \n\t"\ -"psraw $3, %%xmm3 \n\t"\ -"psraw $3, %%xmm4 \n\t"\ -"packuswb %%xmm4, %%xmm3 \n\t"\ -"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ -"add $16, %%"FF_REG_c"\n\t"\ -"cmp %2, %%"FF_REG_c"\n\t"\ -"movdqa %%xmm7, %%xmm3\n\t" \ -"movdqa %%xmm7, %%xmm4\n\t" \ -"mov %0, %%"FF_REG_d"\n\t"\ -"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ -"jb 1b \n\t" - -if (offset) { -__asm__ volatile( -"movq %5, %%xmm3 \n\t" -"movdqa%%xmm3
[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
Patch has been rebased from latest commits. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. --- libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 +++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 ++ tests/checkasm/sw_scale.c | 20 -- 6 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 libswscale/x86/scale_avx2.asm diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 708facba67..64aa0b9804 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index ae92ac9fbc..d4a72d3ce1 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 0 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ +for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; +int to 
= (i) * filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm new file mode 100644 index 00..d90fd2d791 --- /dev/null +++ b/libswscale/x86/scale_avx2.asm @@ -0,0 +1,112 @@ +;**
Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
On Tue, Dec 14, 2021 at 6:07 PM James Almer wrote: > On 12/14/2021 12:23 PM, Alan Kelly wrote: > > Patch has been rebased from latest commits. > > These functions replace all ff_hscale8to15_*_ssse3 when avx2 is > available. > > --- > > libswscale/swscale_internal.h | 2 + > > libswscale/utils.c| 37 +++ > > libswscale/x86/Makefile | 1 + > > libswscale/x86/scale_avx2.asm | 112 ++ > > libswscale/x86/swscale.c | 19 ++ > > tests/checkasm/sw_scale.c | 20 -- > > 6 files changed, 186 insertions(+), 5 deletions(-) > > create mode 100644 libswscale/x86/scale_avx2.asm > > > > diff --git a/libswscale/swscale_internal.h > b/libswscale/swscale_internal.h > > index 708facba67..64aa0b9804 100644 > > --- a/libswscale/swscale_internal.h > > +++ b/libswscale/swscale_internal.h > > @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, > int threadnr, > > //number of extra lines to process > > #define MAX_LINES_AHEAD 4 > > > > +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 > > +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > > diff --git a/libswscale/utils.c b/libswscale/utils.c > > index ae92ac9fbc..d4a72d3ce1 100644 > > --- a/libswscale/utils.c > > +++ b/libswscale/utils.c > > @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = { > > [AV_PIX_FMT_P416LE] = { 1, 0 }, > > }; > > > > +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > > +#if ARCH_X86_64 > > +int i, j, k, l; > > +int cpu_flags = av_get_cpu_flags(); > > +if (EXTERNAL_AVX2_FAST(cpu_flags)){ > > +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > > +if (dstW % 16 == 0){ > > +if (filter != NULL){ > > +for (i = 0; i < dstW; i += 8){ > > +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); > > +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); > > +} > > +if (filterSize > 4){ > > +int16_t *tmp2 = av_malloc(dstW * 
filterSize * > 2); > > +memcpy(tmp2, filter, dstW * filterSize * 2); > > +for (i = 0; i < dstW; i += 16){//pixel > > +for (k = 0; k < filterSize / 4; > ++k){//fcoeff > > +for (j = 0; j < 16; ++j){//inner pixel > > +for (l = 0; l < 4; ++l){//coeff > > +int from = i * filterSize + j * > filterSize + k * 4 + l; > > +int to = (i) * filterSize + j * > 4 + l + k * 64; > > +filter[to] = tmp2[from]; > > +} > > +} > > +} > > +} > > +av_free(tmp2); > > +} > > +} > > +} > > +} > > +} > > +#endif > > +} > > + > > int sws_isSupportedInput(enum AVPixelFormat pix_fmt) > > { > > return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? > > @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > > get_local_pos(c, 0, 0, 0), > > get_local_pos(c, 0, 0, 0))) < 0) > > goto fail; > > +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, > c->hLumFilterSize, c->hLumFilter, dstW); > > if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, > > &c->hChrFilterSize, c->chrXInc, > > c->chrSrcW, c->chrDstW, filterAlign, 1 << > 14, > > @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > > get_local_pos(c, c->chrSrcHSubSample, > c->src_h_chr_pos, 0), > > get_local_pos(c, c->chrDstHSubSample, > c->dst_h_chr_pos, 0))) < 0) > > goto fail; > > +ff_sh
[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.
Fixes so that fate under 64 bit Windows passes. These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available. --- libswscale/swscale_internal.h | 2 + libswscale/utils.c| 37 +++ libswscale/x86/Makefile | 1 + libswscale/x86/scale_avx2.asm | 112 ++ libswscale/x86/swscale.c | 19 ++ tests/checkasm/sw_scale.c | 20 -- 6 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 libswscale/x86/scale_avx2.asm diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 708facba67..64aa0b9804 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, //number of extra lines to process #define MAX_LINES_AHEAD 4 +//shuffle filter and filterPos for hyScale and hcScale filters in avx2 +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index ae92ac9fbc..d4a72d3ce1 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 0 }, }; +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +#if ARCH_X86_64 +int i, j, k, l; +int cpu_flags = av_get_cpu_flags(); +if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ +if (dstW % 16 == 0){ +if (filter != NULL){ +for (i = 0; i < dstW; i += 8){ +FFSWAP(int, filterPos[i + 2], filterPos[i+4]); +FFSWAP(int, filterPos[i + 3], filterPos[i+5]); +} +if (filterSize > 4){ +int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +memcpy(tmp2, filter, dstW * filterSize * 2); +for (i = 0; i < dstW; i += 16){//pixel +for (k = 0; k < filterSize / 4; ++k){//fcoeff +for (j = 0; j < 16; ++j){//inner pixel +for (l = 0; l < 4; ++l){//coeff +int from = i * filterSize + j * filterSize + k * 4 + l; +int 
to = (i) * filterSize + j * 4 + l + k * 64; +filter[to] = tmp2[from]; +} +} +} +} +av_free(tmp2); +} +} +} +} +} +#endif +} + int sws_isSupportedInput(enum AVPixelFormat pix_fmt) { return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ? @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; +ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); } } // initialize horizontal stuff diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile index bfe383364e..68391494be 100644 --- a/libswscale/x86/Makefile +++ b/libswscale/x86/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o X86ASM-OBJS += x86/input.o \ x86/output.o \ x86/scale.o \ + x86/scale_avx2.o \ x86/rgb_2_rgb.o \ x86/yuv_2_rgb.o \ x86/yuv2yuvX.o \ diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm new file mode 100644 index 00..4e4fe5d794 --- /dev/null +++ b/libswscale/x86/scale_avx2.asm @@ -0,0 +1,112 @@ +;*
[FFmpeg-devel] [PATCH] x86/swscale: fix minor coding style issues
--- libswscale/x86/swscale.c | 14 +++--- tests/checkasm/sw_scale.c | 3 +-- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 164b06d6ba..c49a05c37b 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -578,13 +578,13 @@ switch(c->dstBpc){ \ break; \ } -if (EXTERNAL_AVX2_FAST(cpu_flags)){ - if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if(c->chrDstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if(c->dstW % 16 == 0) - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); - } +if (EXTERNAL_AVX2_FAST(cpu_flags)) { +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { +if (c->chrDstW % 16 == 0) +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +if (c->dstW % 16 == 0) +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +} } if (EXTERNAL_AVX2_FAST(cpu_flags)) { diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 011cb46428..f4912e6c2c 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -217,9 +217,8 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if (cpu_flags & AV_CPU_FLAG_AVX2){ +if (cpu_flags & AV_CPU_FLAG_AVX2) ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); -} if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH] x86/swscale: fix minor coding style issues
Thanks Lynne for the patch. On Thu, Dec 16, 2021 at 5:05 PM Alan Kelly wrote: > --- > libswscale/x86/swscale.c | 14 +++--- > tests/checkasm/sw_scale.c | 3 +-- > 2 files changed, 8 insertions(+), 9 deletions(-) > > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 164b06d6ba..c49a05c37b 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -578,13 +578,13 @@ switch(c->dstBpc){ \ > break; \ > } > > -if (EXTERNAL_AVX2_FAST(cpu_flags)){ > - if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > -if(c->chrDstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > -if(c->dstW % 16 == 0) > - ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > - } > +if (EXTERNAL_AVX2_FAST(cpu_flags)) { > +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > +if (c->chrDstW % 16 == 0) > +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > +if (c->dstW % 16 == 0) > +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > +} > } > > if (EXTERNAL_AVX2_FAST(cpu_flags)) { > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index 011cb46428..f4912e6c2c 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -217,9 +217,8 @@ static void check_hscale(void) > } > ff_sws_init_scale(ctx); > memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * > MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > -if (cpu_flags & AV_CPU_FLAG_AVX2){ > +if (cpu_flags & AV_CPU_FLAG_AVX2) > ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, > filterAvx2, SRC_PIXELS); > -} > > if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", > ctx->srcBpc, ctx->dstBpc + 1, width)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > -- > 2.34.1.173.g76aa8bc2d0-goog > > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] x86/scale_avx2: Change asm indent from 2 to 4 spaces.
--- libswscale/x86/scale_avx2.asm | 96 +-- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 2cd7e968d3..eb472db12f 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -45,63 +45,63 @@ SECTION .text %macro SCALE_FUNC 1 cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, count, inner - pxor m0, m0 - mova m15, [swizzle] - mov countq, $0 - movsxd wq, wd +pxor m0, m0 +mova m15, [swizzle] +mov countq, $0 +movsxd wq, wd %ifidn %1, X4 - mova m14, [four] - shr fltsized, 2 +mova m14, [four] +shr fltsized, 2 %endif .loop: - movu m1, [fltposq] - movu m2, [fltposq+32] +movu m1, [fltposq] +movu m2, [fltposq+32] %ifidn %1, X4 - pxor m9, m9 - pxor m10, m10 - pxor m11, m11 - pxor m12, m12 - mov innerq, $0 +pxor m9, m9 +pxor m10, m10 +pxor m11, m11 +pxor m12, m12 +mov innerq, $0 .innerloop: %endif - vpcmpeqd m13, m13 - vpgatherdd m3,[srcmemq + m1], m13 - vpcmpeqd m13, m13 - vpgatherdd m4,[srcmemq + m2], m13 - vpunpcklbw m5, m3, m0 - vpunpckhbw m6, m3, m0 - vpunpcklbw m7, m4, m0 - vpunpckhbw m8, m4, m0 - vpmaddwd m5, m5, [filterq] - vpmaddwd m6, m6, [filterq + 32] - vpmaddwd m7, m7, [filterq + 64] - vpmaddwd m8, m8, [filterq + 96] - add filterq, $80 +vpcmpeqd m13, m13 +vpgatherdd m3,[srcmemq + m1], m13 +vpcmpeqd m13, m13 +vpgatherdd m4,[srcmemq + m2], m13 +vpunpcklbw m5, m3, m0 +vpunpckhbw m6, m3, m0 +vpunpcklbw m7, m4, m0 +vpunpckhbw m8, m4, m0 +vpmaddwd m5, m5, [filterq] +vpmaddwd m6, m6, [filterq + 32] +vpmaddwd m7, m7, [filterq + 64] +vpmaddwd m8, m8, [filterq + 96] +add filterq, $80 %ifidn %1, X4 - paddd m9, m5 - paddd m10, m6 - paddd m11, m7 - paddd m12, m8 - paddd m1, m14 - paddd m2, m14 - add innerq, $1 - cmp innerq, fltsizeq - jl .innerloop - vphaddd m5, m9, m10 - vphaddd m6, m11, m12 +paddd m9, m5 +paddd m10, m6 +paddd m11, m7 +paddd m12, m8 +paddd m1, m14 +paddd m2, m14 +add innerq, $1 +cmp innerq, fltsizeq +jl .innerloop +vphaddd m5, 
m9, m10 +vphaddd m6, m11, m12 %else - vphaddd m5, m5, m6 - vphaddd m6, m7, m8 +vphaddd m5, m5, m6 +vphaddd m6, m7, m8 %endif - vpsrad m5, 7 - vpsrad m6, 7 - vpackssdw m5, m5, m6 - vpermd m5, m15, m5 - vmovdqu [dstq + countq * 2], m5 - add fltposq, $40 - add countq, $10 - cmp countq, wq - jl .loop +vpsrad m5, 7 +vpsrad m6, 7 +vpackssdw m5, m5, m6 +vpermd m5, m15, m5 +vmovdqu [dstq + countq * 2], m5 +add fltposq, $40 +add countq, $10 +cmp countq, wq +jl .loop REP_RET %endmacro -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
This flag is set on Haswell and earlier and all AMD cpus. --- As discussed on IRC last week. libavutil/cpu.h | 57 +++-- libavutil/x86/cpu.c | 13 ++- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index ae443eccad..4272d11d73 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -26,34 +26,35 @@ #define AV_CPU_FLAG_FORCE0x8000 /* force usage of selected flags (OR) */ /* lower 16 bits - CPU features */ -#define AV_CPU_FLAG_MMX 0x0001 ///< standard MMX -#define AV_CPU_FLAG_MMXEXT 0x0002 ///< SSE integer functions or AMD MMX ext -#define AV_CPU_FLAG_MMX2 0x0002 ///< SSE integer functions or AMD MMX ext -#define AV_CPU_FLAG_3DNOW0x0004 ///< AMD 3DNOW -#define AV_CPU_FLAG_SSE 0x0008 ///< SSE functions -#define AV_CPU_FLAG_SSE2 0x0010 ///< PIV SSE2 functions -#define AV_CPU_FLAG_SSE2SLOW 0x4000 ///< SSE2 supported, but usually not faster -///< than regular MMX/SSE (e.g. Core1) -#define AV_CPU_FLAG_3DNOWEXT 0x0020 ///< AMD 3DNowExt -#define AV_CPU_FLAG_SSE3 0x0040 ///< Prescott SSE3 functions -#define AV_CPU_FLAG_SSE3SLOW 0x2000 ///< SSE3 supported, but usually not faster -///< than regular MMX/SSE (e.g. Core1) -#define AV_CPU_FLAG_SSSE30x0080 ///< Conroe SSSE3 functions -#define AV_CPU_FLAG_SSSE3SLOW 0x400 ///< SSSE3 supported, but usually not faster -#define AV_CPU_FLAG_ATOM 0x1000 ///< Atom processor, some SSSE3 instructions are slower -#define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions -#define AV_CPU_FLAG_SSE420x0200 ///< Nehalem SSE4.2 functions -#define AV_CPU_FLAG_AESNI 0x8 ///< Advanced Encryption Standard functions -#define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used -#define AV_CPU_FLAG_AVXSLOW 0x800 ///< AVX supported, but slow when using YMM registers (e.g. 
Bulldozer) -#define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions -#define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions -#define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction -#define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used -#define AV_CPU_FLAG_FMA30x1 ///< Haswell FMA3 functions -#define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 -#define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 -#define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used +#define AV_CPU_FLAG_MMX 0x0001 ///< standard MMX +#define AV_CPU_FLAG_MMXEXT 0x0002 ///< SSE integer functions or AMD MMX ext +#define AV_CPU_FLAG_MMX20x0002 ///< SSE integer functions or AMD MMX ext +#define AV_CPU_FLAG_3DNOW 0x0004 ///< AMD 3DNOW +#define AV_CPU_FLAG_SSE 0x0008 ///< SSE functions +#define AV_CPU_FLAG_SSE20x0010 ///< PIV SSE2 functions +#define AV_CPU_FLAG_SSE2SLOW0x4000 ///< SSE2 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) +#define AV_CPU_FLAG_3DNOWEXT0x0020 ///< AMD 3DNowExt +#define AV_CPU_FLAG_SSE30x0040 ///< Prescott SSE3 functions +#define AV_CPU_FLAG_SSE3SLOW0x2000 ///< SSE3 supported, but usually not faster + ///< than regular MMX/SSE (e.g. Core1) +#define AV_CPU_FLAG_SSSE3 0x0080 ///< Conroe SSSE3 functions +#define AV_CPU_FLAG_SSSE3SLOW0x400 ///< SSSE3 supported, but usually not faster +#define AV_CPU_FLAG_ATOM0x1000 ///< Atom processor, some SSSE3 instructions are slower +#define AV_CPU_FLAG_SSE40x0100 ///< Penryn SSE4.1 functions +#define AV_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions +#define AV_CPU_FLAG_AESNI 0x8 ///< Advanced Encryption Standard functions +#define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used +#define AV_CPU_FLAG_AVXSLOW 0x800 ///< AVX supported, but slow when using YMM registers (e.g. 
Bulldozer) +#define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions +#define AV_CPU_FLAG_FMA40x0800 ///< Bulldozer FMA4 functions +#define AV_CPU_FLAG_CMOV0x1000 ///< supports cmov instruction +#define AV_CPU_FLAG_AVX20x8000 ///< AVX2 functions: requires OS support even if YMM registers aren't used +#define AV_CPU_FLAG_FMA3 0x1 ///< Haswell FMA3 functions +#define AV_CPU_FLAG_BMI1 0x2 ///< Bit Manipulation Instruction Set 1 +#define AV_CPU_FLAG_BMI2 0x4 ///< Bit Manipulation
[FFmpeg-devel] [PATCH 2/2] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.
This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions are only used where they are faster. --- libswscale/utils.c| 2 +- libswscale/x86/swscale.c | 2 +- tests/checkasm/sw_scale.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index d4a72d3ce1..9a69b45afe 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz #if ARCH_X86_64 int i, j, k, l; int cpu_flags = av_get_cpu_flags(); -if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ if (dstW % 16 == 0){ if (filter != NULL){ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index c49a05c37b..eb5334a2be 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -578,7 +578,7 @@ switch(c->dstBpc){ \ break; \ } -if (EXTERNAL_AVX2_FAST(cpu_flags)) { +if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { if (c->chrDstW % 16 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index f4912e6c2c..680562af08 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -217,7 +217,7 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if (cpu_flags & AV_CPU_FLAG_AVX2) +if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER) ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
This flag is set on Haswell and earlier and all AMD cpus. --- Removes unnecessary indentation, clarifies comment and only sets flag on AMD cpus with AVX2. libavutil/cpu.h | 1 + libavutil/x86/cpu.c | 14 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index ae443eccad..ce9bf14bf7 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -54,6 +54,7 @@ #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used +#define AV_CPU_FLAG_SLOW_GATHER 0x200 ///< CPU has slow gathers. #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard #define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..563984f234 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) { rval |= AV_CPU_FLAG_AVX2; +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +/* Haswell has slow gather */ +if(family == 6 && model < 70) +rval |= AV_CPU_FLAG_SLOW_GATHER; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void) used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. 
*/ if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) rval |= AV_CPU_FLAG_AVXSLOW; + +/* AMD cpus have slow gather */ +if(rval & AV_CPU_FLAG_AVX2) +rval |= AV_CPU_FLAG_SLOW_GATHER; } /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/2] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.
This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions are only used where they are faster. --- Whoops! Corrects check so that this flag is only enabled where fast avx2 and fast gathers are available. libswscale/utils.c| 2 +- libswscale/x86/swscale.c | 2 +- tests/checkasm/sw_scale.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index d4a72d3ce1..7158384f0b 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz #if ARCH_X86_64 int i, j, k, l; int cpu_flags = av_get_cpu_flags(); -if (EXTERNAL_AVX2_FAST(cpu_flags)){ +if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ if (dstW % 16 == 0){ if (filter != NULL){ diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index c49a05c37b..ffc7691c12 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -578,7 +578,7 @@ switch(c->dstBpc){ \ break; \ } -if (EXTERNAL_AVX2_FAST(cpu_flags)) { +if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { if (c->chrDstW % 16 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index f4912e6c2c..3c0a083b42 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -217,7 +217,7 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if (cpu_flags & AV_CPU_FLAG_AVX2) +if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { -- 2.34.1.173.g76aa8bc2d0-goog ___ 
ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
This flag is set on Haswell and earlier and all AMD cpus. --- Sets this flag on Zen 3 and earlier. libavutil/cpu.h | 1 + libavutil/x86/cpu.c | 14 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index ae443eccad..ce9bf14bf7 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -54,6 +54,7 @@ #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used +#define AV_CPU_FLAG_SLOW_GATHER 0x200 ///< CPU has slow gathers. #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard #define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..11467ba99d 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) { rval |= AV_CPU_FLAG_AVX2; +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +/* Haswell has slow gather */ +if(family == 6 && model < 70) +rval |= AV_CPU_FLAG_SLOW_GATHER; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void) used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. 
*/ if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) rval |= AV_CPU_FLAG_AVXSLOW; + +/* Zen 3 and earlier have slow gather */ +if((rval & AV_CPU_FLAG_AVX2) && family <= 25) +rval |= AV_CPU_FLAG_SLOW_GATHER; } /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
On Mon, Dec 20, 2021 at 3:53 PM James Almer wrote: > > > On 12/20/2021 11:47 AM, Lynne wrote: > > 20 Dec 2021, 15:43 by alankelly-at-google@ffmpeg.org: > > > >> This flag is set on Haswell and earlier and all AMD cpus. > >> --- > >> Removes unnecessary indentation, clarifies comment and only sets flag > on AMD > >> cpus with AVX2. > >> libavutil/cpu.h | 1 + > >> libavutil/x86/cpu.c | 14 +- > >> 2 files changed, 14 insertions(+), 1 deletion(-) > >> > >> diff --git a/libavutil/cpu.h b/libavutil/cpu.h > >> index ae443eccad..ce9bf14bf7 100644 > >> --- a/libavutil/cpu.h > >> +++ b/libavutil/cpu.h > >> @@ -54,6 +54,7 @@ > >> #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation > Instruction Set 1 > >> #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation > Instruction Set 2 > >> #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: > requires OS support even if YMM/ZMM registers aren't used > >> +#define AV_CPU_FLAG_SLOW_GATHER 0x200 ///< CPU has slow gathers. > >> > >> #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard > >> #define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 > >> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c > >> index bcd41a50a2..563984f234 100644 > >> --- a/libavutil/x86/cpu.c > >> +++ b/libavutil/x86/cpu.c > >> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void) > >> if (max_std_level >= 7) { > >> cpuid(7, eax, ebx, ecx, edx); > >> #if HAVE_AVX2 > >> -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) > >> +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) { > >> rval |= AV_CPU_FLAG_AVX2; > >> +cpuid(1, eax, ebx, ecx, std_caps); > >> +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); > >> +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); > >> +/* Haswell has slow gather */ > >> +if(family == 6 && model < 70) > >> +rval |= AV_CPU_FLAG_SLOW_GATHER; > >> +} > >> + > >> #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ > >> if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ > >> if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) > >> @@ -196,6 +204,10 @@ int 
ff_get_cpu_flags_x86(void) > >> used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */ > >> if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX)) > >> rval |= AV_CPU_FLAG_AVXSLOW; > >> + > >> +/* AMD cpus have slow gather */ > >> +if(rval & AV_CPU_FLAG_AVX2) > >> +rval |= AV_CPU_FLAG_SLOW_GATHER; > >> } > >> > > > > No, I'd rather limit AMD CPUs to all currently released CPUs. > > Future ones are getting AVX512, which did speed up gathers on > > Intel CPUs, as the ISA extension extended gathers and addded > > scatters. > > I wouldn't hold my breath for that, but it's probably a good idea > anyway. A check so it's flagged only on Excavator and Zen <= 3. > > > > > Also your previous patch introduces ff_shuffle_filter_coefficients() > > which is so bad it pretty much needs a complete rewrite. > > You're also not detecting malloc errors or propagating them back. > > That's unrelated to this patch. > > > > > ___ > > ffmpeg-devel mailing list > > ffmpeg-devel@ffmpeg.org > > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > > > To unsubscribe, visit link above, or email > > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > Updated patch sent with check for family <= 25 so that future CPUs will have avx2 hscale enabled by default. I may have time this week to look at ff_shuffle_filter_coefficients. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.
This flag is set on Haswell and earlier and all AMD cpus. --- Checks for family for Haswell. All checks are done where AVX2 flag is set as this is clearer. libavutil/cpu.h | 1 + libavutil/x86/cpu.c | 15 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/libavutil/cpu.h b/libavutil/cpu.h index ae443eccad..ce9bf14bf7 100644 --- a/libavutil/cpu.h +++ b/libavutil/cpu.h @@ -54,6 +54,7 @@ #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used +#define AV_CPU_FLAG_SLOW_GATHER 0x200 ///< CPU has slow gathers. #define AV_CPU_FLAG_ALTIVEC 0x0001 ///< standard #define AV_CPU_FLAG_VSX 0x0002 ///< ISA 2.06 diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c index bcd41a50a2..441b4695d5 100644 --- a/libavutil/x86/cpu.c +++ b/libavutil/x86/cpu.c @@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void) if (max_std_level >= 7) { cpuid(7, eax, ebx, ecx, edx); #if HAVE_AVX2 -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) { rval |= AV_CPU_FLAG_AVX2; +cpuid(1, eax, ebx, ecx, std_caps); +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); +model = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0); +/* Haswell has slow gather */ +if (!strncmp(vendor.c, "GenuineIntel", 12)) +if (family == 6 && model < 70) +rval |= AV_CPU_FLAG_SLOW_GATHER; +/* Zen 3 and earlier have slow gather */ +if (!strncmp(vendor.c, "AuthenticAMD", 12)) +if (family <= 0x19) +rval |= AV_CPU_FLAG_SLOW_GATHER; +} + #if HAVE_AVX512 /* F, CD, BW, DQ, VL */ if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */ if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003) -- 2.34.1.173.g76aa8bc2d0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with 
subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.
Make the code more readable, follow the style guide and propagate memory allocation errors. --- libswscale/swscale_internal.h | 2 +- libswscale/utils.c| 68 --- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 3a78d95ba6..26d28d42e6 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, #define MAX_LINES_AHEAD 4 //shuffle filter and filterPos for hyScale and hcScale filters in avx2 -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index c5ea8853d5..52f07e1661 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, + int filterSize, int16_t *filter, + int dstW) +{ #if ARCH_X86_64 -int i, j, k, l; +int i = 0, j = 0, k = 0; int cpu_flags = av_get_cpu_flags(); +if (!filter || dstW % 16 != 0) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if (dstW % 16 == 0){ -if (filter != NULL){ -for (i = 0; i < dstW; i += 8){ -FFSWAP(int, filterPos[i + 2], filterPos[i+4]); -FFSWAP(int, filterPos[i + 3], filterPos[i+5]); -} -if (filterSize > 4){ -int16_t *tmp2 = av_malloc(dstW * filterSize * 2); -memcpy(tmp2, filter, dstW * filterSize * 2); -for (i = 0; i < dstW; i += 16){//pixel -for (k = 0; k < filterSize / 4; ++k){//fcoeff -for (j = 0; j < 16; ++j){//inner pixel -for (l 
= 0; l < 4; ++l){//coeff -int from = i * filterSize + j * filterSize + k * 4 + l; -int to = (i) * filterSize + j * 4 + l + k * 64; -filter[to] = tmp2[from]; -} -} -} -} -av_free(tmp2); -} -} -} +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { + int16_t *filterCopy = NULL; + if (filterSize > 4) { + if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) + return AVERROR(ENOMEM); + memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t)); + } + // Do not swap filterPos for pixels which won't be processed by + // the main loop. + for (i = 0; i + 8 <= dstW; i += 8) { + FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); + FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + } + if (filterSize > 4) { + // 16 pixels are processed at a time. + for (i = 0; i + 16 <= dstW; i += 16) { + // 4 filter coeffs are processed at a time. + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 16; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 16; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } + } + if (filterCopy) + av_free(filterCopy); } } #endif +return 0; } int sws_isSupportedInput(enum AVPixelFormat pix_fmt) @@ -1836,7 +1844,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); +if ((ret = ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW)) != 0) +goto nomem; if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
[FFmpeg-devel] [PATCH 2/4] libswscale: AVX2 hscale can process any input whose size is a multiple of 4.
The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. --- libswscale/x86/scale_avx2.asm | 48 +-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..dc42abb100 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif +cmp wq, 16 +jl .tail_loop +mov countq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, vpsrad m6, 7 vpackssdw m5, m5, m6 vpermd m5, m15, m5 -vmovdqu [dstq + countq * 2], m5 +vmovdqu [dstq], m5 +add dstq, 0x20 add fltposq, 0x40 add countq, 0x10 cmp countq, wq -jl .loop +jle .loop + +sub countq, 0x10 +cmp countq, wq +jge .end + +.tail_loop: +movu xm1, [fltposq] +%ifidn %1, X4 +pxor xm9, xm9 +pxor xm10, xm10 +xor innerq, innerq +.tail_innerloop: +%endif +vpcmpeqd xm13, xm13 +vpgatherdd xm3,[srcmemq + xm1], xm13 +vpunpcklbw xm5, xm3, xm0 +vpunpckhbw xm6, xm3, xm0 +vpmaddwd xm5, xm5, [filterq] +vpmaddwd xm6, xm6, [filterq + 16] +add filterq, 0x20 +%ifidn %1, X4 +paddd xm9, xm5 +paddd xm10, xm6 +paddd xm1, xm14 +add innerq, 1 +cmp innerq, fltsizeq +jl .tail_innerloop +vphaddd xm5, xm9, xm10 +%else +vphaddd xm5, xm5, xm6 +%endif +vpsrad xm5, 7 +vpackssdw xm5, xm5, xm5 +vmovq [dstq], xm5 +add dstq, 0x8 +add fltposq, 0x10 +add countq, 0x4 +cmp countq, wq +jl .tail_loop +.end: REP_RET %endmacro -- 2.34.1.575.g55b058a8bb-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/4] libswscale: Enable hscale_avx2 for input sizes which are multiples of 4.
ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 17 +++-- libswscale/x86/swscale.c | 4 ++-- 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 52f07e1661..7e1e9c3834 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -285,7 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i = 0, j = 0, k = 0; int cpu_flags = av_get_cpu_flags(); -if (!filter || dstW % 16 != 0) return 0; +if (!filter || (dstW % 4 != 0)) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { int16_t *filterCopy = NULL; @@ -296,9 +296,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -312,6 +314,17 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i + 4 <= dstW; i += 4) { + // 4 filter coeffs are processed at a time. 
+ for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 4; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } if (filterCopy) av_free(filterCopy); diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index fdc93866a6..1d8f19aa5a 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -580,9 +580,9 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { -if (c->chrDstW % 16 == 0) +if (c->chrDstW % 4 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if (c->dstW % 16 == 0) +if (c->dstW % 4 == 0) ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } -- 2.34.1.575.g55b058a8bb-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 4/4] checkasm/sw_scale: hscale does not require a cpuflag test.
This is done in ff_shuffle_filter_coefficients. --- tests/checkasm/sw_scale.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 3c0a083b42..e7f916d3a8 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -168,8 +168,6 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); -int cpu_flags = av_get_cpu_flags(); - ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -215,10 +213,10 @@ static void check_hscale(void) filter[SRC_PIXELS * width + i] = rnd(); } + ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.34.1.575.g55b058a8bb-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.
Hi, Is anybody interested in this patch set? Thanks! On Mon, Jan 10, 2022, 15:58 Alan Kelly wrote: > Make the code more readable, follow the style guide and propagate memory > allocation errors. > --- > libswscale/swscale_internal.h | 2 +- > libswscale/utils.c| 68 --- > 2 files changed, 40 insertions(+), 30 deletions(-) > > diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h > index 3a78d95ba6..26d28d42e6 100644 > --- a/libswscale/swscale_internal.h > +++ b/libswscale/swscale_internal.h > @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int > threadnr, > #define MAX_LINES_AHEAD 4 > > //shuffle filter and filterPos for hyScale and hcScale filters in avx2 > -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > diff --git a/libswscale/utils.c b/libswscale/utils.c > index c5ea8853d5..52f07e1661 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = { > [AV_PIX_FMT_P416LE] = { 1, 1 }, > }; > > -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, > + int filterSize, int16_t *filter, > + int dstW) > +{ > #if ARCH_X86_64 > -int i, j, k, l; > +int i = 0, j = 0, k = 0; > int cpu_flags = av_get_cpu_flags(); > +if (!filter || dstW % 16 != 0) return 0; > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ > -if (dstW % 16 == 0){ > -if (filter != NULL){ > -for (i = 0; i < dstW; i += 8){ > -FFSWAP(int, filterPos[i + 2], filterPos[i+4]); > -FFSWAP(int, filterPos[i + 3], filterPos[i+5]); > -} > -if (filterSize > 4){ > -int16_t *tmp2 = av_malloc(dstW 
* filterSize * 2); > -memcpy(tmp2, filter, dstW * filterSize * 2); > -for (i = 0; i < dstW; i += 16){//pixel > -for (k = 0; k < filterSize / 4; ++k){//fcoeff > -for (j = 0; j < 16; ++j){//inner pixel > -for (l = 0; l < 4; ++l){//coeff > -int from = i * filterSize + j * > filterSize + k * 4 + l; > -int to = (i) * filterSize + j * 4 > + l + k * 64; > -filter[to] = tmp2[from]; > -} > -} > -} > -} > -av_free(tmp2); > -} > -} > -} > +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > + int16_t *filterCopy = NULL; > + if (filterSize > 4) { > + if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) > + return AVERROR(ENOMEM); > + memcpy(filterCopy, filter, dstW * filterSize * > sizeof(int16_t)); > + } > + // Do not swap filterPos for pixels which won't be processed by > + // the main loop. > + for (i = 0; i + 8 <= dstW; i += 8) { > + FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); > + FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); > + } > + if (filterSize > 4) { > + // 16 pixels are processed at a time. > + for (i = 0; i + 16 <= dstW; i += 16) { > + // 4 filter coeffs are processed at a time. > + for (k = 0; k + 4 <= filterSize; k += 4) { > + for (j = 0; j < 16; ++j) { > + int from = (i + j) * filterSize + k; > + int to = i * filterSize + j * 4 + k * 16; > + memcpy(&filter[to], &filterCopy[from], 4 * > sizeof(int16_t)); > + } > + } > + } > + } > + if (filterCopy) &g
[FFmpeg-devel] [PATCH 1/5] libswscale: Re-factor ff_shuffle_filter_coefficients.
Make the code more readable and follow the style guide. --- libswscale/utils.c | 64 +++--- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index c5ea8853d5..1d919e863a 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,39 +278,49 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, + int filterSize, int16_t *filter, + int dstW) +{ #if ARCH_X86_64 -int i, j, k, l; +int i, j, k; int cpu_flags = av_get_cpu_flags(); +// avx2 hscale filter processes 16 pixel blocks. +if (!filter || dstW % 16 != 0) +return; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if (dstW % 16 == 0){ -if (filter != NULL){ -for (i = 0; i < dstW; i += 8){ -FFSWAP(int, filterPos[i + 2], filterPos[i+4]); -FFSWAP(int, filterPos[i + 3], filterPos[i+5]); -} -if (filterSize > 4){ -int16_t *tmp2 = av_malloc(dstW * filterSize * 2); -memcpy(tmp2, filter, dstW * filterSize * 2); -for (i = 0; i < dstW; i += 16){//pixel -for (k = 0; k < filterSize / 4; ++k){//fcoeff -for (j = 0; j < 16; ++j){//inner pixel -for (l = 0; l < 4; ++l){//coeff -int from = i * filterSize + j * filterSize + k * 4 + l; -int to = (i) * filterSize + j * 4 + l + k * 64; -filter[to] = tmp2[from]; -} -} -} -} -av_free(tmp2); -} -} -} +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { + int16_t *filterCopy = NULL; + if (filterSize > 4) { + if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) + return; + memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t)); + } + // Do not swap filterPos for pixels which won't be processed by + // the main loop. 
+ for (i = 0; i + 8 <= dstW; i += 8) { + FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); + FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + } + if (filterSize > 4) { + // 16 pixels are processed at a time. + for (i = 0; i + 16 <= dstW; i += 16) { + // 4 filter coeffs are processed at a time. + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 16; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 16; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } + } + if (filterCopy) + av_free(filterCopy); } } #endif +return; } int sws_isSupportedInput(enum AVPixelFormat pix_fmt) -- 2.35.0.263.gb82422642f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/5] libswscale: Avx2 hscale can process inputs of any size.
The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. --- libswscale/x86/scale_avx2.asm | 48 +-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..dc42abb100 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif +cmp wq, 16 +jl .tail_loop +mov countq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, vpsrad m6, 7 vpackssdw m5, m5, m6 vpermd m5, m15, m5 -vmovdqu [dstq + countq * 2], m5 +vmovdqu [dstq], m5 +add dstq, 0x20 add fltposq, 0x40 add countq, 0x10 cmp countq, wq -jl .loop +jle .loop + +sub countq, 0x10 +cmp countq, wq +jge .end + +.tail_loop: +movu xm1, [fltposq] +%ifidn %1, X4 +pxor xm9, xm9 +pxor xm10, xm10 +xor innerq, innerq +.tail_innerloop: +%endif +vpcmpeqd xm13, xm13 +vpgatherdd xm3,[srcmemq + xm1], xm13 +vpunpcklbw xm5, xm3, xm0 +vpunpckhbw xm6, xm3, xm0 +vpmaddwd xm5, xm5, [filterq] +vpmaddwd xm6, xm6, [filterq + 16] +add filterq, 0x20 +%ifidn %1, X4 +paddd xm9, xm5 +paddd xm10, xm6 +paddd xm1, xm14 +add innerq, 1 +cmp innerq, fltsizeq +jl .tail_innerloop +vphaddd xm5, xm9, xm10 +%else +vphaddd xm5, xm5, xm6 +%endif +vpsrad xm5, 7 +vpackssdw xm5, xm5, xm5 +vmovq [dstq], xm5 +add dstq, 0x8 +add fltposq, 0x10 +add countq, 0x4 +cmp countq, wq +jl .tail_loop +.end: REP_RET %endmacro -- 2.35.0.263.gb82422642f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/5] libswscale: Enable hscale_avx2 for all input sizes.
ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 19 --- libswscale/x86/swscale.c | 6 ++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 1d919e863a..31c365fcee 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -285,8 +285,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); -// avx2 hscale filter processes 16 pixel blocks. -if (!filter || dstW % 16 != 0) +if (!filter) return; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -298,9 +297,11 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -314,6 +315,18 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } if (filterCopy) av_free(filterCopy); diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 73869355b8..76f5a70fc5 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { -if (c->chrDstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if (c->dstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } -- 2.35.0.263.gb82422642f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 4/5] libswscale: Propagate error codes from ff_shuffle_filter_coefficients
--- libswscale/swscale_internal.h | 2 +- libswscale/utils.c| 14 -- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 3a78d95ba6..26d28d42e6 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, #define MAX_LINES_AHEAD 4 //shuffle filter and filterPos for hyScale and hcScale filters in avx2 -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index 31c365fcee..1f8705a417 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW) { @@ -286,13 +286,13 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int i, j, k; int cpu_flags = av_get_cpu_flags(); if (!filter) -return; +return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { int16_t *filterCopy = NULL; if (filterSize > 4) { if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) - return; + return AVERROR(ENOMEM); memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t)); } // Do not swap filterPos for pixels which won't be processed by @@ -333,7 +333,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } #endif -return; +return 0; } int sws_isSupportedInput(enum AVPixelFormat pix_fmt) @@ -1859,7 +1859,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, 
get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); +if ((ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW)) < 0) +goto nomem; if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1869,7 +1870,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); +if ((ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW)) < 0) +goto nomem; } } // initialize horizontal stuff -- 2.35.0.263.gb82422642f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 5/5] checkasm/sw_scale: hscale does not require a cpuflag test.
This is done in ff_shuffle_filter_coefficients. --- tests/checkasm/sw_scale.c | 6 ++ 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 3c0a083b42..e7f916d3a8 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -168,8 +168,6 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); -int cpu_flags = av_get_cpu_flags(); - ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -215,10 +213,10 @@ static void check_hscale(void) filter[SRC_PIXELS * width + i] = rnd(); } + ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.35.0.263.gb82422642f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.
Hi Michael, Thanks for your feedback. I have updated the patches and split this patch into two, one with cosmetic fixes and one propagating the errors. Since there is now an extra patch in the set and the commit messages have changed, new threads have been started. Alan On Thu, Feb 3, 2022 at 3:11 PM Michael Niedermayer wrote: > On Mon, Jan 10, 2022 at 03:58:33PM +0100, Alan Kelly wrote: > > Make the code more readable, follow the style guide and propagate memory > > allocation errors. > > Cosmetics and bugfixes should not be in the same patch > > > > --- > > libswscale/swscale_internal.h | 2 +- > > libswscale/utils.c| 68 --- > > 2 files changed, 40 insertions(+), 30 deletions(-) > > > > diff --git a/libswscale/swscale_internal.h > b/libswscale/swscale_internal.h > > index 3a78d95ba6..26d28d42e6 100644 > > --- a/libswscale/swscale_internal.h > > +++ b/libswscale/swscale_internal.h > > @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, > int threadnr, > > #define MAX_LINES_AHEAD 4 > > > > //shuffle filter and filterPos for hyScale and hcScale filters in avx2 > > -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > > +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int > filterSize, int16_t *filter, int dstW); > > #endif /* SWSCALE_SWSCALE_INTERNAL_H */ > > diff --git a/libswscale/utils.c b/libswscale/utils.c > > index c5ea8853d5..52f07e1661 100644 > > --- a/libswscale/utils.c > > +++ b/libswscale/utils.c > > @@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = { > > [AV_PIX_FMT_P416LE] = { 1, 1 }, > > }; > > > > -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int > filterSize, int16_t *filter, int dstW){ > > +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, > > + int filterSize, int16_t *filter, > > + int dstW) > > +{ > > #if ARCH_X86_64 > > > -int i, j, k, l; > > +int i = 0, j = 0, k = 0; > > why? 
> they are set when used if iam not mistaken > > > > int cpu_flags = av_get_cpu_flags(); > > > +if (!filter || dstW % 16 != 0) return 0; > > please add \n also a comment what the dstW & 16 case exactly does and why > > > [...] > > int sws_isSupportedInput(enum AVPixelFormat pix_fmt) > > @@ -1836,7 +1844,8 @@ av_cold int sws_init_context(SwsContext *c, > SwsFilter *srcFilter, > > get_local_pos(c, 0, 0, 0), > > get_local_pos(c, 0, 0, 0))) < 0) > > goto fail; > > -ff_shuffle_filter_coefficients(c, c->hLumFilterPos, > c->hLumFilterSize, c->hLumFilter, dstW); > > +if ((ret = ff_shuffle_filter_coefficients(c, > c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW)) != 0) > > +goto nomem; > > This is confusing as ret is never used, also error codes are <0 > > thx > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Those who are best at talking, realize last or never when they are wrong. > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 1/5] libswscale: Check and propagate memory allocation errors from ff_shuffle_filter_coefficients.
--- libswscale/swscale_internal.h | 2 +- libswscale/utils.c| 11 --- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h index 3a78d95ba6..26d28d42e6 100644 --- a/libswscale/swscale_internal.h +++ b/libswscale/swscale_internal.h @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int threadnr, #define MAX_LINES_AHEAD 4 //shuffle filter and filterPos for hyScale and hcScale filters in avx2 -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int filterSize, int16_t *filter, int dstW); #endif /* SWSCALE_SWSCALE_INTERNAL_H */ diff --git a/libswscale/utils.c b/libswscale/utils.c index c5ea8853d5..344c87dfdf 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ #if ARCH_X86_64 int i, j, k, l; int cpu_flags = av_get_cpu_flags(); @@ -292,6 +292,8 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz } if (filterSize > 4){ int16_t *tmp2 = av_malloc(dstW * filterSize * 2); +if (!tmp2) +return AVERROR(ENOMEM); memcpy(tmp2, filter, dstW * filterSize * 2); for (i = 0; i < dstW; i += 16){//pixel for (k = 0; k < filterSize / 4; ++k){//fcoeff @@ -310,6 +312,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSiz } } } +return 0; #endif } @@ -1836,7 +1839,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, 0, 0, 0), get_local_pos(c, 0, 0, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW); +if 
(ff_shuffle_filter_coefficients(c, c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW) < 0) +goto nomem; if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos, &c->hChrFilterSize, c->chrXInc, c->chrSrcW, c->chrDstW, filterAlign, 1 << 14, @@ -1846,7 +1850,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter, get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0), get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0) goto fail; -ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW); +if (ff_shuffle_filter_coefficients(c, c->hChrFilterPos, c->hChrFilterSize, c->hChrFilter, c->chrDstW) < 0) +goto nomem; } } // initialize horizontal stuff -- 2.35.1.265.g69c8d7142f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 2/5] libswscale: Refactor ff_shuffle_filter_coefficients.
Make the code more readable and follow the style guide. --- libswscale/utils.c | 66 +- 1 file changed, 36 insertions(+), 30 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 344c87dfdf..7c8e1bbdde 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -278,42 +278,48 @@ static const FormatEntry format_entries[] = { [AV_PIX_FMT_P416LE] = { 1, 1 }, }; -int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int filterSize, int16_t *filter, int dstW){ +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, + int filterSize, int16_t *filter, + int dstW) +{ #if ARCH_X86_64 -int i, j, k, l; +int i, j, k; int cpu_flags = av_get_cpu_flags(); +// avx2 hscale filter processes 16 pixel blocks. +if (!filter || dstW % 16 != 0) +return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){ -if (dstW % 16 == 0){ -if (filter != NULL){ -for (i = 0; i < dstW; i += 8){ -FFSWAP(int, filterPos[i + 2], filterPos[i+4]); -FFSWAP(int, filterPos[i + 3], filterPos[i+5]); -} -if (filterSize > 4){ -int16_t *tmp2 = av_malloc(dstW * filterSize * 2); -if (!tmp2) -return AVERROR(ENOMEM); -memcpy(tmp2, filter, dstW * filterSize * 2); -for (i = 0; i < dstW; i += 16){//pixel -for (k = 0; k < filterSize / 4; ++k){//fcoeff -for (j = 0; j < 16; ++j){//inner pixel -for (l = 0; l < 4; ++l){//coeff -int from = i * filterSize + j * filterSize + k * 4 + l; -int to = (i) * filterSize + j * 4 + l + k * 64; -filter[to] = tmp2[from]; -} -} -} -} -av_free(tmp2); -} -} -} +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { + int16_t *filterCopy = NULL; + if (filterSize > 4) { + if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize)) + return AVERROR(ENOMEM); + memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t)); + } + // Do not swap filterPos for pixels which won't be processed by + // the main loop. 
+ for (i = 0; i + 8 <= dstW; i += 8) { + FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); + FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + } + if (filterSize > 4) { + // 16 pixels are processed at a time. + for (i = 0; i + 16 <= dstW; i += 16) { + // 4 filter coeffs are processed at a time. + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < 16; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 16; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } + } + av_free(filterCopy); } } -return 0; #endif +return 0; } int sws_isSupportedInput(enum AVPixelFormat pix_fmt) -- 2.35.1.265.g69c8d7142f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.
The main loop processes blocks of 16 pixels. The tail processes blocks of size 4. --- libswscale/x86/scale_avx2.asm | 48 +-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm index 20acdbd633..dc42abb100 100644 --- a/libswscale/x86/scale_avx2.asm +++ b/libswscale/x86/scale_avx2.asm @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, mova m14, [four] shr fltsized, 2 %endif +cmp wq, 16 +jl .tail_loop +mov countq, 0x10 .loop: movu m1, [fltposq] movu m2, [fltposq+32] @@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, fltsize, vpsrad m6, 7 vpackssdw m5, m5, m6 vpermd m5, m15, m5 -vmovdqu [dstq + countq * 2], m5 +vmovdqu [dstq], m5 +add dstq, 0x20 add fltposq, 0x40 add countq, 0x10 cmp countq, wq -jl .loop +jle .loop + +sub countq, 0x10 +cmp countq, wq +jge .end + +.tail_loop: +movu xm1, [fltposq] +%ifidn %1, X4 +pxor xm9, xm9 +pxor xm10, xm10 +xor innerq, innerq +.tail_innerloop: +%endif +vpcmpeqd xm13, xm13 +vpgatherdd xm3,[srcmemq + xm1], xm13 +vpunpcklbw xm5, xm3, xm0 +vpunpckhbw xm6, xm3, xm0 +vpmaddwd xm5, xm5, [filterq] +vpmaddwd xm6, xm6, [filterq + 16] +add filterq, 0x20 +%ifidn %1, X4 +paddd xm9, xm5 +paddd xm10, xm6 +paddd xm1, xm14 +add innerq, 1 +cmp innerq, fltsizeq +jl .tail_innerloop +vphaddd xm5, xm9, xm10 +%else +vphaddd xm5, xm5, xm6 +%endif +vpsrad xm5, 7 +vpackssdw xm5, xm5, xm5 +vmovq [dstq], xm5 +add dstq, 0x8 +add fltposq, 0x10 +add countq, 0x4 +cmp countq, wq +jl .tail_loop +.end: REP_RET %endmacro -- 2.35.1.265.g69c8d7142f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c | 19 --- libswscale/x86/swscale.c | 6 ++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index 7c8e1bbdde..d818c9ce55 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); -// avx2 hscale filter processes 16 pixel blocks. -if (!filter || dstW % 16 != 0) +if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 73869355b8..76f5a70fc5 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { -if (c->chrDstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if (c->dstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } -- 2.35.1.265.g69c8d7142f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 5/5] checkasm/sw_scale: hscale does not require a cpuflag test.
This is done in ff_shuffle_filter_coefficients. --- tests/checkasm/sw_scale.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 3c0a083b42..4c57b6a372 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -168,8 +168,6 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); -int cpu_flags = av_get_cpu_flags(); - ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -217,8 +215,7 @@ static void check_hscale(void) } ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", ctx->srcBpc, ctx->dstBpc + 1, width)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.35.1.265.g69c8d7142f-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.
Hi Michael, Thanks for reviewing the first two parts of this patchset. Is there anybody interested in reviewing this part? Thanks, Alan On Thu, Feb 17, 2022 at 5:21 PM Michael Niedermayer wrote: > On Thu, Feb 17, 2022 at 11:04:04AM +0100, Alan Kelly wrote: > > The main loop processes blocks of 16 pixels. The tail processes blocks > > of size 4. > > --- > > libswscale/x86/scale_avx2.asm | 48 +-- > > 1 file changed, 46 insertions(+), 2 deletions(-) > > I'll wait a few days on this, there are people here who know avx2 better > than I do. > It's been a while since I wrote x86 SIMD, > but if no one else reviews this then I'll do it. > > thx > > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > "You are 36 times more likely to die in a bathtub than at the hands of a > terrorist. Also, you are 2.5 times more likely to become a president and > 2 times more likely to become an astronaut, than to die in a terrorist > attack." -- Thoughty2 > > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size
--- libswscale/x86/swscale.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index ff16398988..8c67bf4fab 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ return; \ } -#define YUV2YUVX_FUNC(opt, step) \ +#define YUV2YUVX_FUNC(opt, step, tail) \ void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \ uint8_t *dest, int dstW, \ const uint8_t *dither, int offset); \ @@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ if(pixelsProcessed > 0) \ ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ - ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ + yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ } \ return; \ } @@ -220,10 +220,10 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ YUV2YUVX_FUNC_MMX(mmxext, 16) #endif #if HAVE_SSE3_EXTERNAL -YUV2YUVX_FUNC(sse3, 32) +YUV2YUVX_FUNC(sse3, 32, mmxext) #endif #if HAVE_AVX2_EXTERNAL -YUV2YUVX_FUNC(avx2, 64) +YUV2YUVX_FUNC(avx2, 64, sse3) #endif #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ -- 2.41.0.255.g8b1d071c50-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512
--- libswscale/x86/swscale.c| 7 +++ libswscale/x86/yuv2yuvX.asm | 19 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 8c67bf4fab..52423a1199 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32, mmxext) #if HAVE_AVX2_EXTERNAL YUV2YUVX_FUNC(avx2, 64, sse3) #endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +YUV2YUVX_FUNC(avx512, 128, avx2) +#endif #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ @@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_AVX2_EXTERNAL if (EXTERNAL_AVX2_FAST(cpu_flags)) c->yuv2planeX = yuv2yuvX_avx2; +#endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +if (EXTERNAL_AVX512(cpu_flags)) +c->yuv2planeX = yuv2yuvX_avx512; #endif } #if ARCH_X86_32 && !HAVE_ALIGNED_STACK diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 369c850674..57bfa09d66 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -22,6 +22,10 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA 64 + +permutation: dq 0, 2, 4, 6, 1, 3, 5, 7 + SECTION .text ;- @@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 + +%if cpuflag(avx512) +mova m15, [permutation] +%endif cmp offsetd, 0 jz .offset @@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset packuswb m6, m6, m1 %endif mov srcq, [filterq] -%if cpuflag(avx2) +%if cpuflag(avx512) +vpermt2q m3, m15, m3 +vpermt2q m6, m15, m6 +%elif cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif @@ -131,4 +142,10 @@ YUV2YUVX_FUNC %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 YUV2YUVX_FUNC +%if HAVE_AVX512_EXTERNAL +%if ARCH_X86_64 +INIT_ZMM avx512 +YUV2YUVX_FUNC +%endif +%endif %endif -- 2.41.0.255.g8b1d071c50-goog ___ ffmpeg-devel 
mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.
--- libswscale/x86/swscale.c| 11 --- libswscale/x86/yuv2yuvX.asm | 12 ++-- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 52423a1199..71434f58d3 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -int remainder = (dstW % step); \ -int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -if(pixelsProcessed > 0) \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ -if(remainder > 0){ \ - yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ -} \ +if (dstW >= step) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +else \ +yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 57bfa09d66..ad0e8bd448 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 +mov ditherq, dstWq +sub dstWq, mmsize * unroll %if cpuflag(avx512) mova m15, [permutation] @@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq -jb .outerloop -RET +jb .outerloop + +mov dstWq, offsetq +mov offsetq, ditherq +sub offsetq, mmsize * unroll +cmp dstWq, ditherq +jb .outerloop +REP_RET %endmacro INIT_MMX mmxext -- 2.41.0.255.g8b1d071c50-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel 
To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512
Happy to add the check. Thanks, Alan On Fri, Jul 14, 2023 at 4:59 PM James Almer wrote: > On 7/14/2023 11:57 AM, Kieran Kunhya wrote: > > On Fri, 14 Jul 2023 at 14:03, James Almer wrote: > > > >> On 7/14/2023 9:59 AM, Kieran Kunhya wrote: > +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL > +if (EXTERNAL_AVX512(cpu_flags)) > +c->yuv2planeX = yuv2yuvX_avx512; > #endif > > >>> > >>>You want EXTERNAL_AVX512ICL here. > >> > >> vpermt2q with zmm registers is avx512f and not any of the extensions, so > >> that check is fine. > >> > > > > We still support Skylake and we don't want downclocking on that platform. > > At least that was my understanding of the intention of AVX512 vs > AVX512ICL. > > It appears I'm the only one following this convention though. > > Ah, no opinion in that regard. I was following the use of the checks in > the strict technical sense of instruction availability. > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512
--- Checks for EXTERNAL_AVX512ICL to prevent downclocking on Skylake libswscale/x86/swscale.c| 7 +++ libswscale/x86/yuv2yuvX.asm | 19 ++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 8c67bf4fab..600c7d6c91 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32, mmxext) #if HAVE_AVX2_EXTERNAL YUV2YUVX_FUNC(avx2, 64, sse3) #endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +YUV2YUVX_FUNC(avx512, 128, avx2) +#endif #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \ void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \ @@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c) #if HAVE_AVX2_EXTERNAL if (EXTERNAL_AVX2_FAST(cpu_flags)) c->yuv2planeX = yuv2yuvX_avx2; +#endif +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL +if (EXTERNAL_AVX512ICL(cpu_flags)) +c->yuv2planeX = yuv2yuvX_avx512; #endif } #if ARCH_X86_32 && !HAVE_ALIGNED_STACK diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 369c850674..57bfa09d66 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -22,6 +22,10 @@ %include "libavutil/x86/x86util.asm" +SECTION_RODATA 64 + +permutation: dq 0, 2, 4, 6, 1, 3, 5, 7 + SECTION .text ;- @@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 + +%if cpuflag(avx512) +mova m15, [permutation] +%endif cmp offsetd, 0 jz .offset @@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset packuswb m6, m6, m1 %endif mov srcq, [filterq] -%if cpuflag(avx2) +%if cpuflag(avx512) +vpermt2q m3, m15, m3 +vpermt2q m6, m15, m6 +%elif cpuflag(avx2) vpermq m3, m3, 216 vpermq m6, m6, 216 %endif @@ -131,4 +142,10 @@ YUV2YUVX_FUNC %if HAVE_AVX2_EXTERNAL INIT_YMM avx2 YUV2YUVX_FUNC +%if HAVE_AVX512_EXTERNAL +%if ARCH_X86_64 +INIT_ZMM avx512 +YUV2YUVX_FUNC +%endif 
+%endif %endif -- 2.41.0.255.g8b1d071c50-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.
On Sat, Jul 15, 2023 at 10:40 PM Michael Niedermayer wrote: > On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote: > > --- > > libswscale/x86/swscale.c| 11 --- > > libswscale/x86/yuv2yuvX.asm | 12 ++-- > > 2 files changed, 14 insertions(+), 9 deletions(-) > > seems to segfault with > > ./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt > yuv410p -s 199x199 -vstrict -2 -y snow3914-199-410.avi > > Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault. > [Switching to Thread 0x7fffaffef700 (LWP 23533)] > 0x5658a0f6 in ff_yuv2yuvX_sse3 () > (gdb) bt > #0 0x5658a0f6 in ff_yuv2yuvX_sse3 () > #1 0x56585bc6 in chr_planar_vscale () > #2 0x565817d1 in scale_internal () > #3 0x565827d9 in ff_sws_slice_worker () > #4 0x5662b06e in thread_worker () > #5 0x775fc6db in start_thread (arg=0x7fffaffef700) at > pthread_create.c:463 > #6 0x7fffed12861f in clone () at > ../sysdeps/unix/sysv/linux/x86_64/clone.S:95 > (gdb) disassemble $rip-32,$rip+32 > Dump of assembler code from 0x5658a0d6 to 0x5658a116: >0x5658a0d6 :std >0x5658a0d7 :fldenv 0xf(%rsi) >0x5658a0da :outsl %ds:(%rsi),(%dx) >0x5658a0db :sti >0x5658a0dc :psraw $0x4,%xmm7 >0x5658a0e1 :movdqa %xmm7,%xmm4 >0x5658a0e5 : movdqa %xmm7,%xmm3 >0x5658a0e9 : movdqa %xmm7,%xmm6 >0x5658a0ed : movdqa %xmm7,%xmm1 >0x5658a0f1 : movddup 0x8(%rsi),%xmm0 > => 0x5658a0f6 : movdqa (%rdx,%rax,2),%xmm2 >0x5658a0fb : pmulhw %xmm0,%xmm2 >0x5658a0ff : movdqa > 0x10(%rdx,%rax,2),%xmm5 >0x5658a105 : pmulhw %xmm0,%xmm5 >0x5658a109 : paddw %xmm2,%xmm3 >0x5658a10d : paddw %xmm5,%xmm4 >0x5658a111 : movdqa > 0x20(%rdx,%rax,2),%xmm2 > End of assembler dump. 
> (gdb) info all-registers > rax0x12 18 > rbx0x32 50 > rcx0x57915480 93825029723264 > rdx0x57687680 93825027044992 > rsi0x5758 93825026909784 > rdi0x5758 93825026909784 > rbp0x5765b880 0x5765b880 > rsp0x7fffaffee7a8 0x7fffaffee7a8 > r8 0x20 32 > r9 0x32 50 > r100x56589860 93825009227872 > r110x576f9dc0 93825027513792 > r120x5763b280 93825026732672 > r130x5758 93825026909784 > r140x577b5800 93825028282368 > r150x57622640 93825026631232 > rip0x5658a0f6 0x5658a0f6 > eflags 0x10297 [ CF PF AF SF IF RF ] > cs 0x33 51 > ss 0x2b 43 > ds 0x0 0 > es 0x0 0 > fs 0x0 0 > gs 0x0 0 > st00(raw 0x) > st10(raw 0x) > st20(raw 0x) > st30(raw 0x) > st40(raw 0x) > st50(raw 0x) > st60(raw 0x) > st70(raw 0x) > fctrl 0x 65535 > fstat 0x 65535 > ftag 0x 43690 > fiseg 0x1 1 > fioff 0x0 0 > foseg 0x5646 22086 > fooff 0xa 10 > fop0x7ff2047 > mxcsr 0x1fa8 [ OE PE IM DM ZM OM UM PM ] > > > > > > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > > index 52423a1199..71434f58d3 100644 > > --- a/libswscale/x86/swscale.c > > +++ b/libswscale/x86/swscale.c > > @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, > int filterSize, \ > > const int16_t **src, uint8_t *dest, int > dstW, \ > > const uint8_t *dither, int offset) \ > > { \ > > -int remainder = (dstW % step); \ > > -int pixelsProcessed = dstW - remainder; \ > > if(((uintptr_t)dest) & 15){ \ > > yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, > offset); \ > > return; \ > > } \ > > -if(pixelsProcessed > 0) \ > > -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, > pixelsProcessed + offset, dither, offset); \ &g
[FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.
--- libswscale/x86/swscale.c| 11 --- libswscale/x86/yuv2yuvX.asm | 24 ++-- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 600c7d6c91..6980002e9e 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ const int16_t **src, uint8_t *dest, int dstW, \ const uint8_t *dither, int offset) \ { \ -int remainder = (dstW % step); \ -int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ -if(pixelsProcessed > 0) \ -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ -if(remainder > 0){ \ - yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ -} \ +if (dstW >= step) \ +ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + offset, dither, offset); \ +else \ +yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index 57bfa09d66..03bfd6ad1d 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else movq xm3, [ditherq] %endif ; avx2 +mov ditherq, dstWq +sub dstWq, mmsize * unroll %if cpuflag(avx512) mova m15, [permutation] @@ -92,13 +94,17 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset %else mova m0, [filterSizeq + 8] %endif -pmulhw m2, m0, [srcq + offsetq * 2] -pmulhw m5, m0, [srcq + offsetq * 2 + mmsize] +movu m2, [srcq + offsetq * 2] +movu m5, [srcq + offsetq * 2 + mmsize] +pmulhw m2, m0, m2 +pmulhw m5, m0, m5 paddwm3, m3, m2 paddwm4, m4, m5 %if cpuflag(sse3) -pmulhw m2, m0, [srcq + offsetq * 2 + 2 * mmsize] -pmulhw m5, m0, [srcq + offsetq * 2 + 3 * mmsize] +movu 
m2, [srcq + offsetq * 2 + 2 * mmsize] +movu m5, [srcq + offsetq * 2 + 3 * mmsize] +pmulhw m2, m0, m2 +pmulhw m5, m0, m5 paddwm6, m6, m2 paddwm1, m1, m5 %endif @@ -131,8 +137,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset add offsetq, mmsize * unroll mov filterSizeq, filterq cmp offsetq, dstWq -jb .outerloop -RET +jb .outerloop + +mov dstWq, offsetq +mov offsetq, ditherq +sub offsetq, mmsize * unroll +cmp dstWq, ditherq +jb .outerloop +REP_RET %endmacro INIT_MMX mmxext -- 2.41.0.255.g8b1d071c50-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.
Hi, Are there any further comments on this patch or can it be committed? Thanks, Alan On Tue, Apr 26, 2022 at 10:00 AM Alan Kelly wrote: > The main loop processes blocks of 16 pixels. The tail processes blocks > of size 4. > --- > libswscale/x86/scale_avx2.asm | 44 ++- > 1 file changed, 43 insertions(+), 1 deletion(-) > > diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm > index 20acdbd633..7657b2825f 100644 > --- a/libswscale/x86/scale_avx2.asm > +++ b/libswscale/x86/scale_avx2.asm > @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, > filter, fltpos, fltsize, > mova m14, [four] > shr fltsized, 2 > %endif > +cmp wq, 16 > +jl .tail_loop > +sub wq, 0x10 > .loop: > movu m1, [fltposq] > movu m2, [fltposq+32] > @@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, > srcmem, filter, fltpos, fltsize, > add fltposq, 0x40 > add countq, 0x10 > cmp countq, wq > -jl .loop > +jle .loop > + > +add wq, 0x10 > +cmp countq, wq > +jge .end > + > +.tail_loop: > +movu xm1, [fltposq] > +%ifidn %1, X4 > +pxor xm9, xm9 > +pxor xm10, xm10 > +xor innerq, innerq > +.tail_innerloop: > +%endif > +vpcmpeqd xm13, xm13 > +vpgatherdd xm3,[srcmemq + xm1], xm13 > +vpunpcklbw xm5, xm3, xm0 > +vpunpckhbw xm6, xm3, xm0 > +vpmaddwd xm5, xm5, [filterq] > +vpmaddwd xm6, xm6, [filterq + 16] > +add filterq, 0x20 > +%ifidn %1, X4 > +paddd xm9, xm5 > +paddd xm10, xm6 > +paddd xm1, xm14 > +add innerq, 1 > +cmp innerq, fltsizeq > +jl .tail_innerloop > +vphaddd xm5, xm9, xm10 > +%else > +vphaddd xm5, xm5, xm6 > +%endif > +vpsrad xm5, 7 > +vpackssdw xm5, xm5, xm5 > +vmovq [dstq + countq * 2], xm5 > +add fltposq, 0x10 > +add countq, 0x4 > +cmp countq, wq > +jl .tail_loop > +.end: > REP_RET > %endmacro > > -- > 2.36.0.rc2.479.g8af0fa9b8e-goog > > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
Pushing this back up to the top. This is required to enable the previous patch in this chain. Thanks On Fri, Apr 22, 2022 at 10:04 AM Alan Kelly wrote: > Ping! > > On Thu, Feb 17, 2022 at 11:04 AM Alan Kelly wrote: > >> ff_shuffle_filter_coefficients shuffles the tail as required. >> --- >> libswscale/utils.c | 19 --- >> libswscale/x86/swscale.c | 6 ++ >> 2 files changed, 18 insertions(+), 7 deletions(-) >> >> diff --git a/libswscale/utils.c b/libswscale/utils.c >> index 7c8e1bbdde..d818c9ce55 100644 >> --- a/libswscale/utils.c >> +++ b/libswscale/utils.c >> @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int >> *filterPos, >> #if ARCH_X86_64 >> int i, j, k; >> int cpu_flags = av_get_cpu_flags(); >> -// avx2 hscale filter processes 16 pixel blocks. >> -if (!filter || dstW % 16 != 0) >> +if (!filter) >> return 0; >> if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & >> AV_CPU_FLAG_SLOW_GATHER)) { >> if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { >> @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, >> int *filterPos, >> } >> // Do not swap filterPos for pixels which won't be processed >> by >> // the main loop. >> - for (i = 0; i + 8 <= dstW; i += 8) { >> + for (i = 0; i + 16 <= dstW; i += 16) { >> FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); >> FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); >> + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); >> + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); >> } >> if (filterSize > 4) { >> // 16 pixels are processed at a time. >> @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, >> int *filterPos, >> } >> } >> } >> + // 4 pixels are processed at a time in the tail. >> + for (; i < dstW; i += 4) { >> + // 4 filter coeffs are processed at a time. >> + int rem = dstW - i >= 4 ? 
4 : dstW - i; >> + for (k = 0; k + 4 <= filterSize; k += 4) { >> + for (j = 0; j < rem; ++j) { >> + int from = (i + j) * filterSize + k; >> + int to = i * filterSize + j * 4 + k * 4; >> + memcpy(&filter[to], &filterCopy[from], 4 * >> sizeof(int16_t)); >> + } >> + } >> + } >> } >> av_free(filterCopy); >> } >> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c >> index 73869355b8..76f5a70fc5 100644 >> --- a/libswscale/x86/swscale.c >> +++ b/libswscale/x86/swscale.c >> @@ -691,10 +691,8 @@ switch(c->dstBpc){ \ >> >> if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & >> AV_CPU_FLAG_SLOW_GATHER)) { >> if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { >> -if (c->chrDstW % 16 == 0) >> -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); >> -if (c->dstW % 16 == 0) >> -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); >> +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); >> +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); >> } >> } >> >> -- >> 2.35.1.265.g69c8d7142f-goog >> >> ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
ff_shuffle_filter_coefficients shuffles the tail as required. --- libswscale/utils.c| 19 --- libswscale/x86/swscale.c | 6 ++ tests/checkasm/sw_scale.c | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/libswscale/utils.c b/libswscale/utils.c index cb4f5b521c..544b7fee96 100644 --- a/libswscale/utils.c +++ b/libswscale/utils.c @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, #if ARCH_X86_64 int i, j, k; int cpu_flags = av_get_cpu_flags(); -// avx2 hscale filter processes 16 pixel blocks. -if (!filter || dstW % 16 != 0) +if (!filter) return 0; if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } // Do not swap filterPos for pixels which won't be processed by // the main loop. - for (i = 0; i + 8 <= dstW; i += 8) { + for (i = 0; i + 16 <= dstW; i += 16) { FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); } if (filterSize > 4) { // 16 pixels are processed at a time. @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, } } } + // 4 pixels are processed at a time in the tail. + for (; i < dstW; i += 4) { + // 4 filter coeffs are processed at a time. + int rem = dstW - i >= 4 ? 
4 : dstW - i; + for (k = 0; k + 4 <= filterSize; k += 4) { + for (j = 0; j < rem; ++j) { + int from = (i + j) * filterSize + k; + int to = i * filterSize + j * 4 + k * 4; + memcpy(&filter[to], &filterCopy[from], 4 * sizeof(int16_t)); + } + } + } } av_free(filterCopy); } diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 628f12137c..f628c71bd4 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) { if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { -if (c->chrDstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); -if (c->dstW % 16 == 0) -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); } } diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index b643a47c30..798990a6cf 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -223,7 +223,7 @@ static void check_hscale(void) ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, SRC_PIXELS); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.37.0.170.g444d1eabd0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH v2 5/5] checkasm/sw_scale: hscale does not require cpuflag test.
This is done in ff_shuffle_filter_coefficients. --- tests/checkasm/sw_scale.c | 5 + 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c index 798990a6cf..7be107bef1 100644 --- a/tests/checkasm/sw_scale.c +++ b/tests/checkasm/sw_scale.c @@ -172,8 +172,6 @@ static void check_hscale(void) const uint8_t *src, const int16_t *filter, const int32_t *filterPos, int filterSize); -int cpu_flags = av_get_cpu_flags(); - ctx = sws_alloc_context(); if (sws_init_context(ctx, NULL, NULL) < 0) fail(); @@ -222,8 +220,7 @@ static void check_hscale(void) ctx->dstW = ctx->chrDstW = input_sizes[dstWi]; ff_sws_init_scale(ctx); memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); -if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & AV_CPU_FLAG_SLOW_GATHER)) -ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); +ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, filterAvx2, ctx->dstW); if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) { memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); -- 2.37.0.170.g444d1eabd0-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
Hi Michael, Thanks for looking at this. I fixed the test issue. Alan On Fri, Jul 15, 2022 at 4:59 PM Alan Kelly wrote: > ff_shuffle_filter_coefficients shuffles the tail as required. > --- > libswscale/utils.c| 19 --- > libswscale/x86/swscale.c | 6 ++ > tests/checkasm/sw_scale.c | 2 +- > 3 files changed, 19 insertions(+), 8 deletions(-) > > diff --git a/libswscale/utils.c b/libswscale/utils.c > index cb4f5b521c..544b7fee96 100644 > --- a/libswscale/utils.c > +++ b/libswscale/utils.c > @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > #if ARCH_X86_64 > int i, j, k; > int cpu_flags = av_get_cpu_flags(); > -// avx2 hscale filter processes 16 pixel blocks. > -if (!filter || dstW % 16 != 0) > +if (!filter) > return 0; > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > // Do not swap filterPos for pixels which won't be processed by > // the main loop. > - for (i = 0; i + 8 <= dstW; i += 8) { > + for (i = 0; i + 16 <= dstW; i += 16) { > FFSWAP(int, filterPos[i + 2], filterPos[i + 4]); > FFSWAP(int, filterPos[i + 3], filterPos[i + 5]); > + FFSWAP(int, filterPos[i + 10], filterPos[i + 12]); > + FFSWAP(int, filterPos[i + 11], filterPos[i + 13]); > } > if (filterSize > 4) { > // 16 pixels are processed at a time. > @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int > *filterPos, > } > } > } > + // 4 pixels are processed at a time in the tail. > + for (; i < dstW; i += 4) { > + // 4 filter coeffs are processed at a time. > + int rem = dstW - i >= 4 ? 
4 : dstW - i; > + for (k = 0; k + 4 <= filterSize; k += 4) { > + for (j = 0; j < rem; ++j) { > + int from = (i + j) * filterSize + k; > + int to = i * filterSize + j * 4 + k * 4; > + memcpy(&filter[to], &filterCopy[from], 4 * > sizeof(int16_t)); > + } > + } > + } > } > av_free(filterCopy); > } > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c > index 628f12137c..f628c71bd4 100644 > --- a/libswscale/x86/swscale.c > +++ b/libswscale/x86/swscale.c > @@ -626,10 +626,8 @@ switch(c->dstBpc){ \ > > if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) { > if ((c->srcBpc == 8) && (c->dstBpc <= 14)) { > -if (c->chrDstW % 16 == 0) > -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > -if (c->dstW % 16 == 0) > -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize); > +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize); > } > } > > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index b643a47c30..798990a6cf 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -223,7 +223,7 @@ static void check_hscale(void) > ff_sws_init_scale(ctx); > memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS > * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH)); > if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & > AV_CPU_FLAG_SLOW_GATHER)) > -ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, SRC_PIXELS); > +ff_shuffle_filter_coefficients(ctx, filterPosAvx, > width, filterAvx2, ctx->dstW); > > if (check_func(ctx->hcScale, > "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width, > ctx->dstW)) { > memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0])); > -- > 2.37.0.170.g444d1eabd0-goog > > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
Hi Michael, I have tried to recreate this locally in a clean client applying the patches as sent in the email thread. I have tried gcc and mingw and this passes for me. Are you sure you applied both patches 3 & 4? If only patch 4 is applied, then I get the error you have. Thanks, Alan On Sat, Jul 16, 2022 at 1:14 PM Michael Niedermayer wrote: > On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote: > > Hi Michael, > > > > Thanks for looking at this. I fixed the test issue. > > seems to be still failing here: > make distclean ; ./configure && make -j32 tests/checkasm/checkasm && > tests/checkasm/checkasm --test=sw_scale > checkasm: using random seed 1328711543 > MMXEXT: > - sw_scale.yuv2yuvX [OK] > SSE2: > - sw_scale.hscale [OK] > SSE3: > - sw_scale.yuv2yuvX [OK] > SSSE3: > - sw_scale.hscale [OK] > SSE4.1: > - sw_scale.hscale [OK] > AVX2: >hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235) >hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235) > - sw_scale.hscale [FAILED] > - sw_scale.yuv2yuvX [OK] > checkasm: 12 of 504 tests have failed > > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > It is a danger to trust the dream we wish for rather than > the science we have, -- Dr. Kenneth Brown > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". 
> ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.
Hi Michael, Is there anything blocking this change being applied? Is there anything I can do to help? Thanks, Alan On Mon, Jul 18, 2022 at 6:49 PM Michael Niedermayer wrote: > On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote: > > Hi Michael, > > > > I have tried to recreate this locally in a clean client applying the > > patches as sent in the email thread. I have tried gcc and mingw and this > > passes for me. Are you sure you applied both patches 3 & 4? If only > patch 4 > > is applied, then I get the error you have. > > ive retested, and i cannot reproduce, i think i had #4 & #5 not #3 and #4 > applied > > thx > > [...] > -- > Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB > > Those who are too smart to engage in politics are punished by being > governed by those who are dumber. -- Plato > ___ > ffmpeg-devel mailing list > ffmpeg-devel@ffmpeg.org > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel > > To unsubscribe, visit link above, or email > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext
--- libswscale/x86/swscale.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 32d441245d..881a4b7798 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ if(pixelsProcessed > 0) \ ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ - ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ + ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ return; \ } -- 2.37.1.595.g718a3a8f04-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext
--- Call yuv2yuvX_mmxext on line 208 also. libswscale/x86/swscale.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 32d441245d..e0f90d5c58 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -205,13 +205,13 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ int remainder = (dstW % step); \ int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ -yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ +yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ if(pixelsProcessed > 0) \ ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ - ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ + ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ return; \ } -- 2.37.1.595.g718a3a8f04-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext
--- Remove yuv2yuvX_mmx as it is no longer used. libswscale/x86/swscale.c | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c index 32d441245d..89ef9f5d2b 100644 --- a/libswscale/x86/swscale.c +++ b/libswscale/x86/swscale.c @@ -205,20 +205,17 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, \ int remainder = (dstW % step); \ int pixelsProcessed = dstW - remainder; \ if(((uintptr_t)dest) & 15){ \ -yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \ +yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \ return; \ } \ if(pixelsProcessed > 0) \ ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, pixelsProcessed + offset, dither, offset); \ if(remainder > 0){ \ - ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ + ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - offset, pixelsProcessed + remainder + offset, dither, offset); \ } \ return; \ } -#if HAVE_MMX_EXTERNAL -YUV2YUVX_FUNC_MMX(mmx, 16) -#endif #if HAVE_MMXEXT_EXTERNAL YUV2YUVX_FUNC_MMX(mmxext, 16) #endif -- 2.37.1.595.g718a3a8f04-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
Re: [FFmpeg-devel] [PATCH v2] checkasm: sw_scale: Produce more realistic test filter coefficients for yuv2yuvX
Thanks Martin for doing this. On Thu, Aug 18, 2022 at 10:16 AM Martin Storsjö wrote: > This avoids triggering overflows in the filters, and avoids stray > test failures in the approximate functions on x86; due to rounding > differences, one implementation might overflow while another one > doesn't. > > Signed-off-by: Martin Storsjö > --- > FWIW, this modification runs successfully with over 1000 different > seeds in checkasm. > --- > tests/checkasm/sw_scale.c | 16 +++- > 1 file changed, 15 insertions(+), 1 deletion(-) > > diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c > index d72506ed86..ec06eafebe 100644 > --- a/tests/checkasm/sw_scale.c > +++ b/tests/checkasm/sw_scale.c > @@ -188,7 +188,6 @@ static void check_yuv2yuvX(int accurate) > uint8_t d_val = rnd(); > memset(dither, d_val, LARGEST_INPUT_SIZE); > randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * > LARGEST_INPUT_SIZE * sizeof(int16_t)); > -randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * > sizeof(int16_t)); > ctx = sws_alloc_context(); > if (accurate) > ctx->flags |= SWS_ACCURATE_RND; > @@ -202,6 +201,21 @@ static void check_yuv2yuvX(int accurate) > if (dstW <= osi) > continue; > for (fsi = 0; fsi < FILTER_SIZES; ++fsi) { > +// Generate filter coefficients for the given filter size, > +// with some properties: > +// - The coefficients add up to the intended sum (4096, > 1<<12) > +// - The coefficients contain negative values > +// - The filter intermediates don't overflow for worst > case > +// inputs (all positive coefficients are coupled with > +// input_max and all negative coefficients with > input_min, > +// or vice versa). > +// Produce a filter with all coefficients set to > +// -((1<<12)/(filter_size-1)) except for one (randomly > chosen) > +// which is set to ((1<<13)-1). 
> +for (i = 0; i < filter_sizes[fsi]; ++i) > +filter_coeff[i] = -((1 << 12) / (filter_sizes[fsi] - > 1)); > +filter_coeff[rnd() % filter_sizes[fsi]] = (1 << 13) - 1; > + > src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]); > vFilterData = av_malloc((filter_sizes[fsi] + 2) * > sizeof(union VFilterData)); > memset(vFilterData, 0, (filter_sizes[fsi] + 2) * > sizeof(union VFilterData)); > -- > 2.25.1 > > ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
[FFmpeg-devel] [PATCH] sws: Don't compile yuv2yuvX for mmx
--- libswscale/x86/yuv2yuvX.asm | 2 -- 1 file changed, 2 deletions(-) diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm index b6294cb919..d5b03495fd 100644 --- a/libswscale/x86/yuv2yuvX.asm +++ b/libswscale/x86/yuv2yuvX.asm @@ -124,8 +124,6 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset REP_RET %endmacro -INIT_MMX mmx -YUV2YUVX_FUNC INIT_MMX mmxext YUV2YUVX_FUNC INIT_XMM sse3 -- 2.37.2.609.g9ff673ca1a-goog ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".