from:"Alan Kelly"

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-02-09 Thread Alan Kelly

Ping!

On Thu, Jan 14, 2021 at 3:47 PM Alan Kelly  wrote:

> ---
>  Replaces cpuflag(mmx) with notcpuflag(sse3) for store macro
>  Tests for multiple sizes in checkasm-sw_scale
>  checkasm-sw_scale aligns memory on 8 bytes instead of 32 to catch aligned
> loads
>  libswscale/x86/Makefile   |   1 +
>  libswscale/x86/swscale.c  | 130 
>  libswscale/x86/swscale_template.c |  82 --
>  libswscale/x86/yuv2yuvX.asm   | 136 ++
>  tests/checkasm/sw_scale.c | 103 ++
>  5 files changed, 294 insertions(+), 158 deletions(-)
>  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 831d5359aa..bfe383364e 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
> \
> x86/scale.o  \
> x86/rgb_2_rgb.o  \
> x86/yuv_2_rgb.o  \
> +   x86/yuv2yuvX.o   \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 15c0b22f20..3df193a067 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset)
> = 0x8080808080808080ULL;
>  DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)=
> 0x0001000100010001ULL;
>
>
> +#define YUV2YUVX_FUNC_DECL(opt)  \
> +static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const
> int16_t **src, \
> +   uint8_t *dest, int dstW, \
> +   const uint8_t *dither, int offset); \
> +
> +YUV2YUVX_FUNC_DECL(mmx)
> +YUV2YUVX_FUNC_DECL(mmxext)
> +YUV2YUVX_FUNC_DECL(sse3)
> +YUV2YUVX_FUNC_DECL(avx2)
> +
>  //MMX versions
>  #if HAVE_MMX_INLINE
>  #undef RENAME
> @@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int
> dstY)
>  }
>
>  #if HAVE_MMXEXT
> -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> -   const int16_t **src, uint8_t *dest, int dstW,
> -   const uint8_t *dither, int offset)
> -{
> -if(((uintptr_t)dest) & 15){
> -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset);
> -return;
> -}
> -filterSize--;
> -#define MAIN_FUNCTION \
> -"pxor   %%xmm0, %%xmm0 \n\t" \
> -"punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -"movd   %4, %%xmm1 \n\t" \
> -"punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -"punpckldq  %%xmm1, %%xmm1 \n\t" \
> -"punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -"psllw  $3, %%xmm1 \n\t" \
> -"paddw  %%xmm1, %%xmm3 \n\t" \
> -"psraw  $4, %%xmm3 \n\t" \
> -"movdqa %%xmm3, %%xmm4 \n\t" \
> -"movdqa %%xmm3, %%xmm7 \n\t" \
> -"movl   %3, %%ecx  \n\t" \
> -"mov %0, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -".p2align 4 \n\t" /*
> FIXME Unroll? */\
> -"1: \n\t"\
> -"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /*
> filterCoeff */\
> -"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2
> \n\t" /* srcData */\
> -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5
> \n\t" /* srcData */\
> -"add$16, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -"test %%"FF_REG_S", %%"FF_REG_S"
>  \n\t"\
> -"pmulhw   %%xmm0, %%xmm2  \n\t"\
> -"pmulhw   %%xmm0, %%xmm5  \n\t"\
> -"paddw%%xmm2, %%xmm3  \n\t"\
> -"paddw%%xmm5, %%xmm4  \n\t"\
> -" jnz

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-02-16 Thread Alan Kelly

Looks like there are no comments, is this OK to be applied? Thanks

On Tue, Feb 9, 2021 at 6:25 PM Paul B Mahol  wrote:

> Will apply if no comments.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_scale.c

2021-02-19 Thread Alan Kelly

Initialises each item in src and filter arrays to fix valgrind
uninitialised value warning.
---
 tests/checkasm/sw_scale.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 7504f8b45f..a4866723d7 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -86,8 +86,10 @@ static void check_yuv2yuvX(void)
 uint16_t coeff[8];
 } *vFilterData;
 uint8_t d_val = rnd();
-randomize_buffers(filter_coeff, LARGEST_FILTER);
-randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE);
+for(i = 0; i < LARGEST_FILTER * LARGEST_INPUT_SIZE; ++i)
+  src_pixels[i] = rnd();
+for(i = 0; i < LARGEST_FILTER; ++i)
+  filter_coeff[i] = rnd();
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] tests/checkasm/sw_scale.c

2021-02-19 Thread Alan Kelly

Checks av_mallocs
---
 tests/checkasm/sw_scale.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index a4866723d7..ef414c0a82 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -103,7 +103,11 @@ static void check_yuv2yuvX(void)
 for(osi = 0; osi < 64; osi += 16){
 for(fsi = 0; fsi < FILTER_SIZES; ++fsi){
 src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]);
+if(!src)
+  fail();
 vFilterData = av_malloc((filter_sizes[fsi] + 2) * sizeof(union 
VFilterData));
+if(!vFilterData)
+  fail();
 memset(vFilterData, 0, (filter_sizes[fsi] + 2) * sizeof(union 
VFilterData));
 for(i = 0; i < filter_sizes[fsi]; ++i){
 src[i] = &src_pixels[i * LARGEST_INPUT_SIZE];
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-02-19 Thread Alan Kelly

Thanks James for spotting this. I have sent two patches fixing the valgrind
error from checkasm and the unchecked av_mallocs.

I do not believe that the two remaining valgrind errors come from my patch,
although I may be mistaken. Using git bisect, I have
identified b94cd55155d8c061f1e1faca9076afe540149c27 as the problematic
commit.

On Thu, Feb 18, 2021 at 11:23 PM James Almer  wrote:

> On 2/17/2021 5:24 PM, Paul B Mahol wrote:
> > On Tue, Feb 16, 2021 at 6:31 PM Alan Kelly <
> > alankelly-at-google@ffmpeg.org> wrote:
> >
> >> Looks like there are no comments, is this OK to be applied? Thanks
> >>
> >
> > Applied, thanks for pinging.
>
> Valgrind complains about this change. The checkasm test specifically.
>
>
> http://fate.ffmpeg.org/report.cgi?time=20210218014903&slot=x86_64-archlinux-gcc-valgrind
>
> I also noticed it has a bunch of unchecked av_mallocs().
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] tests/checkasm/sw_scale.c

2021-02-19 Thread Alan Kelly

Initialises each item in src and filter arrays to fix valgrind
uninitialised value warning.
---
 casts pointers to uint8_t* and multiplies the buffer size by sizeof(uint16_t).
 tests/checkasm/sw_scale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 7504f8b45f..e3bedd57c6 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -86,8 +86,8 @@ static void check_yuv2yuvX(void)
 uint16_t coeff[8];
 } *vFilterData;
 uint8_t d_val = rnd();
-randomize_buffers(filter_coeff, LARGEST_FILTER);
-randomize_buffers(src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE);
+randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * 
LARGEST_INPUT_SIZE * sizeof(uint16_t));
+randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * 
sizeof(uint16_t));
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

2021-02-23 Thread Alan Kelly

---
 libswscale/x86/swscale.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 1e865914cb..71961a9ae0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -206,7 +206,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+if(dstW > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
 return; \
 }
 
@@ -224,7 +225,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
+if(pixelsProcessed > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
   ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
 } \
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

2021-02-23 Thread Alan Kelly

---
 tests/checkasm/sw_scale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index a10118704b..3ac0f9082f 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -68,8 +68,8 @@ static void check_yuv2yuvX(void)
 #define FILTER_SIZES 4
 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
 #define LARGEST_INPUT_SIZE 512
-#define INPUT_SIZES 4
-static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512};
+#define INPUT_SIZES 6
+static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512};
 
 declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
   int filterSize, const int16_t **src, uint8_t *dest,
@@ -107,7 +107,7 @@ static void check_yuv2yuvX(void)
 for(j = 0; j < 4; ++j)
 vFilterData[i].coeff[j + 4] = filter_coeff[i];
 }
-if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", 
filter_sizes[fsi], osi)){
+if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", 
filter_sizes[fsi], osi, dstW)){
 memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
 memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

2021-02-23 Thread Alan Kelly

---
 This is so that tails of size 8 may safely be processed
 libswscale/x86/yuv2yuvX.asm | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 521880dabe..b6294cb919 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -37,8 +37,10 @@ SECTION .text
 cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %if notcpuflag(sse3)
 %define movr mova
+%define unroll 1
 %else
 %define movr movdqu
+%define unroll 2
 %endif
 movsxdifnidn dstWq, dstWd
 movsxdifnidn offsetq, offsetd
@@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 .outerloop:
 mova m4, m7
 mova m3, m7
+%if cpuflag(sse3)
 mova m6, m7
 mova m1, m7
+%endif
 .loop:
 %if cpuflag(avx2)
 vpbroadcastq m0, [filterSizeq + 8]
@@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
 paddwm3, m3, m2
 paddwm4, m4, m5
+%if cpuflag(sse3)
 pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
 pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
 paddwm6, m6, m2
 paddwm1, m1, m5
+%endif
 add  filterSizeq, $10
 mov  srcq, [filterSizeq]
 test srcq, srcq
 jnz  .loop
 psrawm3, m3, 3
 psrawm4, m4, 3
+%if cpuflag(sse3)
 psrawm6, m6, 3
 psrawm1, m1, 3
+%endif
 packuswb m3, m3, m4
+%if cpuflag(sse3)
 packuswb m6, m6, m1
+%endif
 mov  srcq, [filterq]
 %if cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
 movr [destq + offsetq], m3
+%if cpuflag(sse3)
 movr [destq + offsetq + mmsize], m6
-add  offsetq, mmsize * 2
+%endif
+add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
 jb  .outerloop
-- 
2.30.0.617.g56c4b15f3c-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] libswscale/x86/swscale: Only call ff_yuv2yuvX functions if the input size is > 0

2021-04-01 Thread Alan Kelly

---
 libswscale/x86/swscale.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index cc9e8b0155..0848a31461 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,7 +197,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+if(dstW > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
 return; \
 }
 
@@ -215,7 +216,8 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
+if(pixelsProcessed > 0) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
   ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
 } \
-- 
2.31.0.291.g576ba9dcdaf-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] tests/checkasm/sw_scale: adds additional test sizes for yuv2yuvX

2021-04-01 Thread Alan Kelly

---
 tests/checkasm/sw_scale.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index a10118704b..3ac0f9082f 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -68,8 +68,8 @@ static void check_yuv2yuvX(void)
 #define FILTER_SIZES 4
 static const int filter_sizes[FILTER_SIZES] = {1, 4, 8, 16};
 #define LARGEST_INPUT_SIZE 512
-#define INPUT_SIZES 4
-static const int input_sizes[INPUT_SIZES] = {128, 144, 256, 512};
+#define INPUT_SIZES 6
+static const int input_sizes[INPUT_SIZES] = {8, 24, 128, 144, 256, 512};
 
 declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *filter,
   int filterSize, const int16_t **src, uint8_t *dest,
@@ -107,7 +107,7 @@ static void check_yuv2yuvX(void)
 for(j = 0; j < 4; ++j)
 vFilterData[i].coeff[j + 4] = filter_coeff[i];
 }
-if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d", 
filter_sizes[fsi], osi)){
+if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", 
filter_sizes[fsi], osi, dstW)){
 memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
 memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-- 
2.31.0.291.g576ba9dcdaf-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/3] libswscale/x86/yuv2yuvX: Removes unrolling for mmx and mmxext

2021-04-01 Thread Alan Kelly

---
 This is so that inputs of size 8 are supported, as was the case with
 the original implementation. A bug was found with inputs not divisible
 by 16.
 libswscale/x86/yuv2yuvX.asm | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 521880dabe..b6294cb919 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -37,8 +37,10 @@ SECTION .text
 cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, dstW, dither, offset
 %if notcpuflag(sse3)
 %define movr mova
+%define unroll 1
 %else
 %define movr movdqu
+%define unroll 2
 %endif
 movsxdifnidn dstWq, dstWd
 movsxdifnidn offsetq, offsetd
@@ -70,8 +72,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 .outerloop:
 mova m4, m7
 mova m3, m7
+%if cpuflag(sse3)
 mova m6, m7
 mova m1, m7
+%endif
 .loop:
 %if cpuflag(avx2)
 vpbroadcastq m0, [filterSizeq + 8]
@@ -84,28 +88,36 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
 paddwm3, m3, m2
 paddwm4, m4, m5
+%if cpuflag(sse3)
 pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
 pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
 paddwm6, m6, m2
 paddwm1, m1, m5
+%endif
 add  filterSizeq, $10
 mov  srcq, [filterSizeq]
 test srcq, srcq
 jnz  .loop
 psrawm3, m3, 3
 psrawm4, m4, 3
+%if cpuflag(sse3)
 psrawm6, m6, 3
 psrawm1, m1, 3
+%endif
 packuswb m3, m3, m4
+%if cpuflag(sse3)
 packuswb m6, m6, m1
+%endif
 mov  srcq, [filterq]
 %if cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
 movr [destq + offsetq], m3
+%if cpuflag(sse3)
 movr [destq + offsetq + mmsize], m6
-add  offsetq, mmsize * 2
+%endif
+add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
 jb  .outerloop
-- 
2.31.0.291.g576ba9dcdaf-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction

2021-06-14 Thread Alan Kelly

Broadwell and later have fast gather instructions.
---
 This is so that the avx2 version of ff_hscale8to15X which uses gather
 instructions is only selected on machines where it will actually be
 faster.
 libavutil/cpu.c  |  6 ++
 libavutil/cpu.h  |  6 ++
 libavutil/cpu_internal.h |  1 +
 libavutil/x86/cpu.c  | 18 ++
 4 files changed, 31 insertions(+)

diff --git a/libavutil/cpu.c b/libavutil/cpu.c
index 8960415d00..0a723eeb7a 100644
--- a/libavutil/cpu.c
+++ b/libavutil/cpu.c
@@ -49,6 +49,12 @@
 
 static atomic_int cpu_flags = ATOMIC_VAR_INIT(-1);
 
+int av_cpu_has_fast_gather(void){
+if (ARCH_X86)
+return ff_cpu_has_fast_gather();
+return 0;
+}
+
 static int get_cpu_flags(void)
 {
 if (ARCH_MIPS)
diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index b555422dae..faf3a221f4 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -72,6 +72,7 @@
 #define AV_CPU_FLAG_MMI  (1 << 0)
 #define AV_CPU_FLAG_MSA  (1 << 1)
 
+int av_cpu_has_fast_gather(void);
 /**
  * Return the flags which specify extensions supported by the CPU.
  * The returned value is affected by av_force_cpu_flags() if that was used
@@ -107,6 +108,11 @@ int av_cpu_count(void);
  *  av_set_cpu_flags_mask(), then this function will behave as if AVX is not
  *  present.
  */
+
+/**
+ * Returns true if the cpu has fast gather instructions.
+ * Broadwell and later cpus have fast gather
+ */
 size_t av_cpu_max_align(void);
 
 #endif /* AVUTIL_CPU_H */
diff --git a/libavutil/cpu_internal.h b/libavutil/cpu_internal.h
index 889764320b..92525df0c1 100644
--- a/libavutil/cpu_internal.h
+++ b/libavutil/cpu_internal.h
@@ -46,6 +46,7 @@ int ff_get_cpu_flags_aarch64(void);
 int ff_get_cpu_flags_arm(void);
 int ff_get_cpu_flags_ppc(void);
 int ff_get_cpu_flags_x86(void);
+int ff_cpu_has_fast_gather(void);
 
 size_t ff_get_cpu_max_align_mips(void);
 size_t ff_get_cpu_max_align_aarch64(void);
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..9724e0017b 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -270,3 +270,21 @@ size_t ff_get_cpu_max_align_x86(void)
 
 return 8;
 }
+
+int ff_cpu_has_fast_gather(void){
+int eax, ebx, ecx;
+int max_std_level, std_caps = 0;
+int family = 0, model = 0;
+cpuid(0, max_std_level, ebx, ecx, std_caps);
+
+if (max_std_level >= 1) {
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+// Broadwell and later
+if(family == 6 && model >= 70){
+  return 1;
+}
+}
+return 0;
+}
-- 
2.32.0.272.g935e593368-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-06-14 Thread Alan Kelly

These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 +++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 ++
 tests/checkasm/sw_scale.c |  21 +--
 6 files changed, 187 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a1de95cee0..45ef657cd4 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1056,4 +1056,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn 
yuv2plane1, yuv2planarX_fn
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6bac7b658d..0dc1f7df7f 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -267,6 +267,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST(cpu_flags) && av_cpu_has_fast_gather()){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1697,6 +1732,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1706,6 +1742,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o  \
+   x86/scale_avx2.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 00..d90fd2d791
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@
+;**

Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds av_cpu_has_fast_gather to detect cpus with avx fast gather instruction

2021-06-24 Thread Alan Kelly

Hi,

Sorry for the late reply; it was a busy on-call week. Thanks for your responses. I
have looked at the code for cpuflags and what you suggested makes sense. I
just have a question about naming. EXTERNAL_AVX2_FAST is already used in
many places - it checks whether the flag AV_CPU_FLAG_AVXSLOW is set so I
can't use this as it would change its meaning. Could I define a new flag,
as is done for AV_CPU_FLAG_CMOV, e.g. AV_CPU_FLAG_FAST_GATHER or similar? Or
could you please suggest a better solution?

Thanks

On Mon, Jun 14, 2021 at 2:17 PM James Almer  wrote:

> On 6/14/2021 8:53 AM, Ronald S. Bultje wrote:
> > Hi Alan,
> >
> > On Mon, Jun 14, 2021 at 7:20 AM Alan Kelly <
> > alankelly-at-google@ffmpeg.org> wrote:
> >
> >> Broadwell and later have fast gather instructions.
> >> ---
> >>   This is so that the avx2 version of ff_hscale8to15X which uses gather
> >>   instructions is only selected on machines where it will actually be
> >>   faster.
> >>
> >
> > We've in the past typically done this with a bit in the cpuflags return
> > value. Can this be added there instead of being its own function?
> >
> > Also, what is the cycle count of ssse3/avx2 implementation for this
> > specific function on Haswell? It would be good to note that in the
> > respective patch so that we understand why the check was added.
>
> Between 9 and 12 on Haswell, 5 to 7 on Broadwell, and about 2 to 5 on
> Skylake and newer, according to Agner's PDF if I'm reading it right. It's
> also slow on AMD before Zen 3.
>
> And yes, this should if anything be a new cpu flag and not a new function.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

2021-06-25 Thread Alan Kelly

Broadwell and later and Zen3 and later have fast gather instructions.
---
 Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell,
 and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3.
 libavutil/cpu.h |  2 ++
 libavutil/x86/cpu.c | 18 --
 libavutil/x86/cpu.h |  1 +
 3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index b555422dae..f94eb79af1 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -50,6 +50,7 @@
 #define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions
 #define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction
 #define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS 
support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVX2SLOW  0x200 ///< AVX2 supported but gather is 
slower.
 #define AV_CPU_FLAG_FMA30x1 ///< Haswell FMA3 functions
 #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
@@ -107,6 +108,7 @@ int av_cpu_count(void);
  *  av_set_cpu_flags_mask(), then this function will behave as if AVX is not
  *  present.
  */
+
 size_t av_cpu_max_align(void);
 
 #endif /* AVUTIL_CPU_H */
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..56fcde594c 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,20 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){
 rval |= AV_CPU_FLAG_AVX2;
+
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+// Haswell and earlier has slow gather
+if(family == 6 && model < 70)
+rval |= AV_CPU_FLAG_AVX2SLOW;
+// Zen 2 and earlier
+if (!strncmp(vendor.c, "AuthenticAMD", 12) && family < 25)
+rval |= AV_CPU_FLAG_AVX2SLOW;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
@@ -194,8 +206,10 @@ int ff_get_cpu_flags_x86(void)
functions using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
-if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
+if ((family == 0x15 || family == 0x16) && (rval & 
AV_CPU_FLAG_AVX)){
 rval |= AV_CPU_FLAG_AVXSLOW;
+rval |= AV_CPU_FLAG_AVX2SLOW;
+}
 }
 
 /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
diff --git a/libavutil/x86/cpu.h b/libavutil/x86/cpu.h
index 937c697fa0..a42a15a997 100644
--- a/libavutil/x86/cpu.h
+++ b/libavutil/x86/cpu.h
@@ -78,6 +78,7 @@
 #define EXTERNAL_AVX2(flags)CPUEXT_SUFFIX(flags, _EXTERNAL, AVX2)
 #define EXTERNAL_AVX2_FAST(flags)   CPUEXT_SUFFIX_FAST2(flags, _EXTERNAL, 
AVX2, AVX)
 #define EXTERNAL_AVX2_SLOW(flags)   CPUEXT_SUFFIX_SLOW2(flags, _EXTERNAL, 
AVX2, AVX)
+#define EXTERNAL_AVX2_FAST_GATHER(flags)   CPUEXT_SUFFIX_FAST(flags, 
_EXTERNAL, AVX2)
 #define EXTERNAL_AESNI(flags)   CPUEXT_SUFFIX(flags, _EXTERNAL, AESNI)
 #define EXTERNAL_AVX512(flags)  CPUEXT_SUFFIX(flags, _EXTERNAL, AVX512)
 
-- 
2.32.0.93.g670b81a890-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-06-25 Thread Alan Kelly

These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 +++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 ++
 tests/checkasm/sw_scale.c |  21 +--
 6 files changed, 187 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index a1de95cee0..45ef657cd4 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1056,4 +1056,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn 
yuv2plane1, yuv2planarX_fn
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 6bac7b658d..07c4d2f741 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -267,6 +267,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST_GATHER(cpu_flags)){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1697,6 +1732,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1706,6 +1742,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o  \
+   x86/scale_avx2.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 00..d90fd2d791
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@
+;***

Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

2021-06-25 Thread Alan Kelly

On Fri, Jun 25, 2021 at 10:40 AM Lynne  wrote:

> Jun 25, 2021, 09:54 by alankelly-at-google@ffmpeg.org:
>
> > Broadwell and later and Zen3 and later have fast gather instructions.
> > ---
> >  Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on Broadwell,
> >  and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3.
> >  libavutil/cpu.h |  2 ++
> >  libavutil/x86/cpu.c | 18 --
> >  libavutil/x86/cpu.h |  1 +
> >  3 files changed, 19 insertions(+), 2 deletions(-)
> >
>
> No, we really don't need more FAST/SLOW flags, especially for
> something like this which is just fixable by _not_using_vgather_.
> Take a look at libavutil/x86/tx_float.asm, we only use vgather
> if it's guaranteed to either be faster for what we're gathering or
> is just as fast "slow". If neither is true, we use manual lookups,
> which is actually advantageous since for AVX2 we can interleave
> the lookups that happen in each lane.
>
> Even if we disregard this, I've extensively benchmarked vgather
> on Zen 3, Zen 2, Cascade Lake and Skylake, and there's hardly
> a great vgather improvement to be found in Zen 3 to justify
> using a new CPU flag for this.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>

Thanks for your response. I'm not against finding a cleaner way of
enabling/disabling the code which will be protected by this flag. However,
the proposed manual-lookup solution will not work in this case: the avx2
version of hscale is only faster when fast gathers are available;
otherwise, the ssse3 version should be used.

I haven't got access to a Zen 3, so I can't comment on its performance. I
have tested on a Zen 2 and it is slow. On Broadwell, hscale avx2 is about
10% faster than the ssse3 version, and on Skylake about 40% faster. Haswell
has similar performance to Zen 2.

Is there a proxy which could be used for detecting Broadwell or Skylake and
later? AVX512 seems too strict as there are Skylake chips without AVX512.
Thanks
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-06-25 Thread Alan Kelly

On Fri, Jun 25, 2021 at 1:26 PM Ronald S. Bultje  wrote:

> Hi Alan,
>
> On Fri, Jun 25, 2021 at 3:59 AM Alan Kelly <
> alankelly-at-google@ffmpeg.org> wrote:
>
>> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
>>
>
> Re-asking a question I asked before in the other thread:
>
> Also, what is the cycle count of ssse3/avx2 implementation for this
> specific function on Haswell? It would be good to note that in the
> respective patch so that we understand why the check was added.
>
> You should be able to find this in the checkasm --bench --test=X numbers
> for this relevant function.
>
> Ronald
>

Hi Ronald,

                               Skylake  Haswell
hscale_8_to_15_width4_ssse3      761.2      760
hscale_8_to_15_width4_avx2       468.7      957
hscale_8_to_15_width8_ssse3     1170.7     1032
hscale_8_to_15_width8_avx2       865.7     1979
hscale_8_to_15_width12_ssse3    2172.2     2472
hscale_8_to_15_width12_avx2     1245.7     2901
hscale_8_to_15_width16_ssse3    2244.2     2400
hscale_8_to_15_width16_avx2     1647.2     3681

As you can see, it is catastrophic on Haswell. In the next iteration of the
patch, I will update the description with these numbers.

Thanks
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

2021-07-12 Thread Alan Kelly

On Fri, Jun 25, 2021 at 1:24 PM Alan Kelly  wrote:

> On Fri, Jun 25, 2021 at 10:40 AM Lynne  wrote:
>
>> Jun 25, 2021, 09:54 by alankelly-at-google@ffmpeg.org:
>>
>> > Broadwell and later and Zen3 and later have fast gather instructions.
>> > ---
>> >  Gather requires between 9 and 12 cycles on Haswell, 5 to 7 on
>> Broadwell,
>> >  and 2 to 5 on Skylake and newer. It is also slow on AMD before Zen 3.
>> >  libavutil/cpu.h |  2 ++
>> >  libavutil/x86/cpu.c | 18 --
>> >  libavutil/x86/cpu.h |  1 +
>> >  3 files changed, 19 insertions(+), 2 deletions(-)
>> >
>>
>> No, we really don't need more FAST/SLOW flags, especially for
>> something like this which is just fixable by _not_using_vgather_.
>> Take a look at libavutil/x86/tx_float.asm, we only use vgather
>> if it's guaranteed to either be faster for what we're gathering or
>> is just as fast "slow". If neither is true, we use manual lookups,
>> which is actually advantageous since for AVX2 we can interleave
>> the lookups that happen in each lane.
>>
>> Even if we disregard this, I've extensively benchmarked vgather
>> on Zen 3, Zen 2, Cascade Lake and Skylake, and there's hardly
>> a great vgather improvement to be found in Zen 3 to justify
>> using a new CPU flag for this.
>> ___
>> ffmpeg-devel mailing list
>> ffmpeg-devel@ffmpeg.org
>> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>>
>> To unsubscribe, visit link above, or email
>> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>>
>
> Thanks for your response. I'm not against finding a cleaner way of
> enabling/disabling the code which will be protected by this flag. However,
> the manual lookups solution proposed will not work in this case, the avx2
> version of hscale will only be faster if fast gathers are available,
> otherwise, the ssse3 version should be used.
>
> I haven't got access to a Zen3 so I can't comment on the performance. I
> have tested on a Zen 2 and it is slow. On Broadwell hscale avx2 is about
> 10% faster than the ssse3 version and on Skylake about 40% faster, Haswell
> has similar performance to Zen2.
>
> Is there a proxy which could be used for detecting Broadwell or Skylake
> and later? AVX512 seems too strict as there are Skylake chips without
> AVX512. Thanks
>

Hi,

I will paste the performance figures from the thread for the other part of
this patch here so that the justification for this flag is clearer:

                               Skylake  Haswell
hscale_8_to_15_width4_ssse3      761.2      760
hscale_8_to_15_width4_avx2       468.7      957
hscale_8_to_15_width8_ssse3     1170.7     1032
hscale_8_to_15_width8_avx2       865.7     1979
hscale_8_to_15_width12_ssse3    2172.2     2472
hscale_8_to_15_width12_avx2     1245.7     2901
hscale_8_to_15_width16_ssse3    2244.2     2400
hscale_8_to_15_width16_avx2     1647.2     3681

As you can see, it is catastrophic on Haswell and older chips but the gains
on Skylake are impressive.
As I don't have performance figures for Zen 3, I can disable this feature
on all cpus apart from Broadwell and later as you say that there is no
worthwhile improvement on Zen3. Is this OK with you?

Thanks
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

2021-07-16 Thread Alan Kelly

Broadwell and later and Zen3 and later have fast gather instructions.
---
 Haswell is now excluded from EXTERNAL_AVX2_FAST as discussed in the
 email thread.
 libavutil/cpu.h |  1 +
 libavutil/x86/cpu.c | 11 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index c069076439..ec3073d021 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -113,6 +113,7 @@ void av_force_cpu_count(int count);
  *  av_set_cpu_flags_mask(), then this function will behave as if AVX is not
  *  present.
  */
+
 size_t av_cpu_max_align(void);
 
 #endif /* AVUTIL_CPU_H */
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..158e2170c4 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,17 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){
 rval |= AV_CPU_FLAG_AVX2;
+
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+// Haswell and earlier has slow gather
+if(family == 6 && model < 70)
+rval |= AV_CPU_FLAG_AVXSLOW;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
-- 
2.32.0.402.g57bb445576-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-07-16 Thread Alan Kelly

These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
discussed in the email thread for part 1 of this patch.

Benchmark results on Skylake and Haswell:

                               Skylake  Haswell
hscale_8_to_15_width4_ssse3      761.2      760
hscale_8_to_15_width4_avx2       468.7      957
hscale_8_to_15_width8_ssse3     1170.7     1032
hscale_8_to_15_width8_avx2       865.7     1979
hscale_8_to_15_width12_ssse3    2172.2     2472
hscale_8_to_15_width12_avx2     1245.7     2901
hscale_8_to_15_width16_ssse3    2244.2     2400
hscale_8_to_15_width16_avx2     1647.2     3681

 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 +++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 ++
 tests/checkasm/sw_scale.c |  20 --
 6 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 673407636a..fba3dabe5b 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn 
yuv2plane1, yuv2planarX_fn
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 176fc6fd63..0577fd5490 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o

Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Adds fast gather detection.

2021-07-16 Thread Alan Kelly

On Fri, Jul 16, 2021 at 4:02 PM James Almer  wrote:

> On 7/16/2021 10:44 AM, Alan Kelly wrote:
> > Broadwell and later and Zen3 and later have fast gather instructions.
> > ---
> >   Haswell is now excluded from EXTERNAL_AVX2_FAST as discussed in the
> >   email thread.
>
> I was very explicit about this not being ok. We're not disabling all ymm
> usage for Haswell just for one or two swscale functions using gathers.
>
> Let's go with Lynne's latest suggestion and not change the flags at all
> and use gathers on Haswell, same as other arches, by looking at the
> AVX2_FAST flag.
>
> >   libavutil/cpu.h |  1 +
> >   libavutil/x86/cpu.c | 11 ++-
> >   2 files changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> > index c069076439..ec3073d021 100644
> > --- a/libavutil/cpu.h
> > +++ b/libavutil/cpu.h
> > @@ -113,6 +113,7 @@ void av_force_cpu_count(int count);
> >*  av_set_cpu_flags_mask(), then this function will behave as if AVX
> is not
> >*  present.
> >*/
> > +
> >   size_t av_cpu_max_align(void);
> >
> >   #endif /* AVUTIL_CPU_H */
> > diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> > index bcd41a50a2..158e2170c4 100644
> > --- a/libavutil/x86/cpu.c
> > +++ b/libavutil/x86/cpu.c
> > @@ -146,8 +146,17 @@ int ff_get_cpu_flags_x86(void)
> >   if (max_std_level >= 7) {
> >   cpuid(7, eax, ebx, ecx, edx);
> >   #if HAVE_AVX2
> > -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
> > +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)){
> >   rval |= AV_CPU_FLAG_AVX2;
> > +
> > +cpuid(1, eax, ebx, ecx, std_caps);
> > +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> > +model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> > +// Haswell and earlier has slow gather
> > +if(family == 6 && model < 70)
> > +rval |= AV_CPU_FLAG_AVXSLOW;
> > +}
> > +
> >   #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
> >   if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> >   if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) ==
> 0xd003)
> >
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>

OK, apologies for the misunderstanding. In that case part 1 of this patch
is not required. Part two remains valid with the function protected by
EXTERNAL_AVX2_FAST. Should part 2 be re-submitted as a standalone patch or
is it OK as is?
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-07-21 Thread Alan Kelly

On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly  wrote:

> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
> ---
> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
> discussed in the email thread for part 1 of this patch.
>
> Benchmark results on Skylake and Haswell:
>
>                                Skylake  Haswell
> hscale_8_to_15_width4_ssse3      761.2      760
> hscale_8_to_15_width4_avx2       468.7      957
> hscale_8_to_15_width8_ssse3     1170.7     1032
> hscale_8_to_15_width8_avx2       865.7     1979
> hscale_8_to_15_width12_ssse3    2172.2     2472
> hscale_8_to_15_width12_avx2     1245.7     2901
> hscale_8_to_15_width16_ssse3    2244.2     2400
> hscale_8_to_15_width16_avx2     1647.2     3681
>
>  libswscale/swscale_internal.h |   2 +
>  libswscale/utils.c|  37 +++
>  libswscale/x86/Makefile   |   1 +
>  libswscale/x86/scale_avx2.asm | 112 ++
>  libswscale/x86/swscale.c  |  19 ++
>  tests/checkasm/sw_scale.c |  20 --
>  6 files changed, 186 insertions(+), 5 deletions(-)
>  create mode 100644 libswscale/x86/scale_avx2.asm
>
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 673407636a..fba3dabe5b 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c,
> yuv2planar1_fn yuv2plane1, yuv2planarX_fn
>  //number of extra lines to process
>  #define MAX_LINES_AHEAD 4
>
> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
>  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index 176fc6fd63..0577fd5490 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = {
>  [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
>  };
>
> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
> filterSize, int16_t *filter, int dstW){
> +#if ARCH_X86_64
> +int i, j, k, l;
> +int cpu_flags = av_get_cpu_flags();
> +if (EXTERNAL_AVX2_FAST(cpu_flags)){
> +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> +if (dstW % 16 == 0){
> +if (filter != NULL){
> +for (i = 0; i < dstW; i += 8){
> +FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
> +FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
> +}
> +if (filterSize > 4){
> +int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
> +memcpy(tmp2, filter, dstW * filterSize * 2);
> +for (i = 0; i < dstW; i += 16){//pixel
> +for (k = 0; k < filterSize / 4; ++k){//fcoeff
> +for (j = 0; j < 16; ++j){//inner pixel
> +for (l = 0; l < 4; ++l){//coeff
> +int from = i * filterSize + j *
> filterSize + k * 4 + l;
> +int to = (i) * filterSize + j * 4
> + l + k * 64;
> +filter[to] = tmp2[from];
> +}
> +}
> +}
> +}
> +av_free(tmp2);
> +}
> +}
> +}
> +}
> +}
> +#endif
> +}
> +
>  int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
>  {
>  return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
> get_local_pos(c, 0, 0, 0),
> get_local_pos(c, 0, 0, 0))) < 0)
>  goto fail;
> +ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
> c->hLumFilterSize, c->hLumFilter, dstW);
>  if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
> &c->hChrFilterSize, c->chrXInc,
> c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
> @@ -1708,6 +1744,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
> get_local_pos(c, c->chrSrcHSubSample,
> c->src_h_chr_pos, 0),
> get_local_pos(c, c->chrDstHSubSample,
>

Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-07-26 Thread Alan Kelly

On Wed, Jul 21, 2021 at 11:11 AM Alan Kelly  wrote:

>
>
> On Fri, Jul 16, 2021 at 3:48 PM Alan Kelly  wrote:
>
>> These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
>> ---
>> EXTERNAL_AVX2_FAST is now used instead of EXTERNAL_AVX2_FAST_GATHER as
>> discussed in the email thread for part 1 of this patch.
>>
>> Benchmark results on Skylake and Haswell:
>>
>>                                Skylake  Haswell
>> hscale_8_to_15_width4_ssse3      761.2      760
>> hscale_8_to_15_width4_avx2       468.7      957
>> hscale_8_to_15_width8_ssse3     1170.7     1032
>> hscale_8_to_15_width8_avx2       865.7     1979
>> hscale_8_to_15_width12_ssse3    2172.2     2472
>> hscale_8_to_15_width12_avx2     1245.7     2901
>> hscale_8_to_15_width16_ssse3    2244.2     2400
>> hscale_8_to_15_width16_avx2     1647.2     3681
>>
>>  libswscale/swscale_internal.h |   2 +
>>  libswscale/utils.c|  37 +++
>>  libswscale/x86/Makefile   |   1 +
>>  libswscale/x86/scale_avx2.asm | 112 ++
>>  libswscale/x86/swscale.c  |  19 ++
>>  tests/checkasm/sw_scale.c |  20 --
>>  6 files changed, 186 insertions(+), 5 deletions(-)
>>  create mode 100644 libswscale/x86/scale_avx2.asm
>>
>> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
>> index 673407636a..fba3dabe5b 100644
>> --- a/libswscale/swscale_internal.h
>> +++ b/libswscale/swscale_internal.h
>> @@ -1064,4 +1064,6 @@ void ff_init_vscale_pfn(SwsContext *c,
>> yuv2planar1_fn yuv2plane1, yuv2planarX_fn
>>  //number of extra lines to process
>>  #define MAX_LINES_AHEAD 4
>>
>> +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
>> +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
>> filterSize, int16_t *filter, int dstW);
>>  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
>> diff --git a/libswscale/utils.c b/libswscale/utils.c
>> index 176fc6fd63..0577fd5490 100644
>> --- a/libswscale/utils.c
>> +++ b/libswscale/utils.c
>> @@ -268,6 +268,41 @@ static const FormatEntry format_entries[] = {
>>  [AV_PIX_FMT_X2RGB10LE]   = { 1, 1 },
>>  };
>>
>> +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
>> filterSize, int16_t *filter, int dstW){
>> +#if ARCH_X86_64
>> +int i, j, k, l;
>> +int cpu_flags = av_get_cpu_flags();
>> +if (EXTERNAL_AVX2_FAST(cpu_flags)){
>> +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
>> +if (dstW % 16 == 0){
>> +if (filter != NULL){
>> +for (i = 0; i < dstW; i += 8){
>> +FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
>> +FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
>> +}
>> +if (filterSize > 4){
>> +int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
>> +memcpy(tmp2, filter, dstW * filterSize * 2);
>> +for (i = 0; i < dstW; i += 16){//pixel
>> +for (k = 0; k < filterSize / 4; ++k){//fcoeff
>> +for (j = 0; j < 16; ++j){//inner pixel
>> +for (l = 0; l < 4; ++l){//coeff
>> +int from = i * filterSize + j *
>> filterSize + k * 4 + l;
>> +int to = (i) * filterSize + j *
>> 4 + l + k * 64;
>> +filter[to] = tmp2[from];
>> +}
>> +}
>> +}
>> +}
>> +av_free(tmp2);
>> +}
>> +}
>> +}
>> +}
>> +}
>> +#endif
>> +}
>> +
>>  int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
>>  {
>>  return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
>> @@ -1699,6 +1734,7 @@ av_cold int sws_init_context(SwsContext *c,
>> SwsFilter *srcFilter,
>> get_local_pos(c, 0, 0, 0),
>> get_local_pos(c, 0, 0, 0))) < 0)
>>  goto fail;
>> +ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
>> c->hLumFilterSize, c->hLumFilter, dstW);
>>  if ((ret = initFilter(&c->hChrFilter, &c->hChrF

[FFmpeg-devel] [PATCH] Unrolls main loop of yuv2yuvX_sse3 and general code tidying for ~20% speedup

2020-09-15 Thread Alan Kelly

---
 libswscale/x86/swscale.c | 138 ---
 1 file changed, 72 insertions(+), 66 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..e47fee2bbd 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -201,76 +201,82 @@ static void yuv2yuvX_sse3(const int16_t *filter, int 
filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
-if(((uintptr_t)dest) & 15){
+if(((uintptr_t)dest) & 31){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
 filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-}
+__asm__ volatile(
+"vmovq%5, %%xmm3\n\t"
+"cmpl $0, %3\n\t"
+"jz   2f\n\t"
+
+"# offset != 0 path.\n\t"
+"vpsrlq  $24, %%xmm3, %%xmm5\n\t"
+"vpsllq  $40, %%xmm3, %%xmm3\n\t"
+"vpor %%xmm3, %%xmm5, %%xmm3\n\t"
+
+"2: \n\t"
+"vpxor%%xmm0, %%xmm0, %%xmm0\n\t"
+"mov(%0), %%"FF_REG_S"  \n\t"
+"vpunpc

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, although local tests show a significant speed-up

2020-10-22 Thread Alan Kelly

Other functions to be ported to avx2 have been identified and are on
the todo list.
---
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  72 +++--
 libswscale/x86/yuv2yuvX.asm | 105 
 3 files changed, 112 insertions(+), 66 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..ea83b097ca 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,6 +197,10 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
@@ -205,72 +209,8 @@ static void yuv2yuvX_sse3(const int16_t *filter, int 
filterSize,
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, however, although local tests show a significant

2020-10-23 Thread Alan Kelly

 Fixed. The wrong step size was used, causing a write past the end of
 the buffer. yuv2yuvX_mmxext is now called if there are any remaining pixels.
---
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 --
 libswscale/x86/yuv2yuvX.asm | 105 
 3 files changed, 116 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..fec9fa22e0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, although local tests show a significant spee

2020-10-23 Thread Alan Kelly

, %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m"
(offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
"%xmm4" , "%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m"
(offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
"%xmm4" , "%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
+ff_yuv2yuvX_sse3(filter, filterSize - 1, dest - offset,
pixelsProcessed + offset, dither, offset);
+if(remainder > 0){
+  yuv2yuvX_mmxext(filter, filterSize, src, dest + pixelsProcessed,
remainder, dither, offset + pixelsProcessed);
 }
+return;
 }
 #endif

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
new file mode 100644
index 00..84727de599
--- /dev/null
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -0,0 +1,105 @@
+;**
+;* x86-optimized yuv2yuvX
+;* Copyright 2020 Google LLC
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
02110-1301 USA
+;**
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION .text
+
+;-
+; yuv2yuvX
+;
+; void ff_yuv2yuvX_(const int16_t *filter, int filterSize,
+;uint8_t *dest, int dstW,
+;const uint8_t *dither, int offset);
+;
+;-
+
+%macro YUV2YUVX_FUNC 0
+cglobal yuv2yuvX, 6, 7, 16, filter, rsi, dest, dstW, dither, offset, src
+%if ARCH_X86_64
+movsxd   dstWq, dstWd
+movsxd   offsetq, offsetd
+%endif ; x86-64
+movq xmm3, [ditherq]
+cmp  offsetd, 0
+jz   .offset
+
+; offset != 0 path.
+psrlqm5, m3, $18
+psllqm3, m3, $28
+por  m3, m3, m5
+
+.offset:
+%if cpuflag(avx2)
+vperm2i128   m3, m3, m3, 0
+%endif ; avx2
+%if ARCH_X86_64
+movq xmm1, rsiq
+%else
+movd mm1, rsi
+%endif
+vpbroadcastw m1, xmm1
+pxor m0, m0, m0
+mov  rsiq, filterq
+mov  srcq, [rsiq]
+punpcklbwm3, m0
+psllwm1, m1, 3
+paddwm3, m3, m1
+psrawm7, m3, 4
+.outerloop:
+mova m4, m7
+mova m3, m7
+mova m6, m7
+mova m1, m7
+.loop:
+vpbroadcastq m0, [rsiq + 8]
+pmulhw   m2, m0, [srcq + offsetq * 2]
+pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
+paddwm3, m3, m2
+paddwm4, m4, m5
+pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
+pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
+    paddw    m6,

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-10-27 Thread Alan Kelly

---
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 --
 libswscale/x86/yuv2yuvX.asm | 105 
 3 files changed, 116 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..fec9fa22e0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g"

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup. AVX2 version is ready and tested, however, although local tests show a signifi

2020-10-27 Thread Alan Kelly

Thanks for the review, I have made the required changes. As I have changed
the subject the patch is in a new thread.

On Fri, Oct 23, 2020 at 4:10 PM James Almer  wrote:

> On 10/23/2020 10:17 AM, Alan Kelly wrote:
> >  Fixed. The wrong step size was used causing a write passed the end of
> >  the buffer. yuv2yuvX_mmxext is now called if there are any remaining
> pixels.
>
> Please fix the commit subject (It's too long and contains commentary),
> and keep comments about fixes between versions outside of the commit
> message body. You can manually place them after the --- below, or in a
> separate reply.
>
> > ---
> >  libswscale/x86/Makefile |   1 +
> >  libswscale/x86/swscale.c|  75 --
> >  libswscale/x86/yuv2yuvX.asm | 105 
> >  3 files changed, 116 insertions(+), 65 deletions(-)
> >  create mode 100644 libswscale/x86/yuv2yuvX.asm
> >
> > diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> > index 831d5359aa..bfe383364e 100644
> > --- a/libswscale/x86/Makefile
> > +++ b/libswscale/x86/Makefile
> > @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
>   \
> > x86/scale.o
> \
> > x86/rgb_2_rgb.o
> \
> > x86/yuv_2_rgb.o
> \
> > +   x86/yuv2yuvX.o
>  \
> > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> > index 3160fedf04..fec9fa22e0 100644
> > --- a/libswscale/x86/swscale.c
> > +++ b/libswscale/x86/swscale.c
> > @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int
> dstY)
> >  }
> >
> >  #if HAVE_MMXEXT
> > +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> > +   uint8_t *dest, int dstW,
> > +   const uint8_t *dither, int offset);
> > +
> >  static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> > const int16_t **src, uint8_t *dest, int dstW,
> > const uint8_t *dither, int offset)
> >  {
> > +int remainder = (dstW % 32);
> > +int pixelsProcessed = dstW - remainder;
> >  if(((uintptr_t)dest) & 15){
> >  yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset);
> >  return;
> >  }
> > -filterSize--;
> > -#define MAIN_FUNCTION \
> > -"pxor   %%xmm0, %%xmm0 \n\t" \
> > -"punpcklbw  %%xmm0, %%xmm3 \n\t" \
> > -"movd   %4, %%xmm1 \n\t" \
> > -"punpcklwd  %%xmm1, %%xmm1 \n\t" \
> > -"punpckldq  %%xmm1, %%xmm1 \n\t" \
> > -"punpcklqdq %%xmm1, %%xmm1 \n\t" \
> > -"psllw  $3, %%xmm1 \n\t" \
> > -"paddw  %%xmm1, %%xmm3 \n\t" \
> > -"psraw  $4, %%xmm3 \n\t" \
> > -"movdqa %%xmm3, %%xmm4 \n\t" \
> > -"movdqa %%xmm3, %%xmm7 \n\t" \
> > -"movl   %3, %%ecx  \n\t" \
> > -"mov %0, %%"FF_REG_d"
> \n\t"\
> > -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> > -".p2align 4 \n\t" /*
> FIXME Unroll? */\
> > -"1: \n\t"\
> > -"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /*
> filterCoeff */\
> > -"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2
> \n\t" /* srcData */\
> > -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5
> \n\t" /* srcData */\
> > -"add$16, %%"FF_REG_d"
> \n\t"\
> > -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> > -"test %%"FF_REG_S", %%"FF_REG_S"
>  \n\t"\
> > -"pmulhw   %%xmm0, %%xmm2  \n\t"\
> > -"pmulhw   %%xmm0, %%xmm5  \n\t"\
> > -"paddw%%xmm2, %%xmm3  \n\t"\
> > -"paddw

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-10-27 Thread Alan Kelly

Apologies for the multiple threads, my git send-email was wrongly
configured. This has been fixed.

This code has been tested on AVX2, giving a significant speedup. However,
until the ff_hscale* functions are ported to AVX2, it should not be
enabled, as it results in an overall slowdown of swscale, probably due to CPU
frequency scaling.

checkasm will follow in a separate patch.

On Tue, Oct 27, 2020 at 9:56 AM Alan Kelly  wrote:

> ---
>  libswscale/x86/Makefile |   1 +
>  libswscale/x86/swscale.c|  75 --
>  libswscale/x86/yuv2yuvX.asm | 105 
>  3 files changed, 116 insertions(+), 65 deletions(-)
>  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 831d5359aa..bfe383364e 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
> \
> x86/scale.o  \
> x86/rgb_2_rgb.o  \
> x86/yuv_2_rgb.o  \
> +   x86/yuv2yuvX.o   \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..fec9fa22e0 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int
> dstY)
>  }
>
>  #if HAVE_MMXEXT
> +void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> +   uint8_t *dest, int dstW,
> +   const uint8_t *dither, int offset);
> +
>  static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> const int16_t **src, uint8_t *dest, int dstW,
> const uint8_t *dither, int offset)
>  {
> +int remainder = (dstW % 32);
> +int pixelsProcessed = dstW - remainder;
>  if(((uintptr_t)dest) & 15){
>  yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset);
>  return;
>  }
> -filterSize--;
> -#define MAIN_FUNCTION \
> -"pxor   %%xmm0, %%xmm0 \n\t" \
> -"punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -"movd   %4, %%xmm1 \n\t" \
> -"punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -"punpckldq  %%xmm1, %%xmm1 \n\t" \
> -"punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -"psllw  $3, %%xmm1 \n\t" \
> -"paddw  %%xmm1, %%xmm3 \n\t" \
> -"psraw  $4, %%xmm3 \n\t" \
> -"movdqa %%xmm3, %%xmm4 \n\t" \
> -"movdqa %%xmm3, %%xmm7 \n\t" \
> -"movl   %3, %%ecx  \n\t" \
> -"mov %0, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -".p2align 4 \n\t" /*
> FIXME Unroll? */\
> -"1: \n\t"\
> -"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /*
> filterCoeff */\
> -"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2
> \n\t" /* srcData */\
> -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5
> \n\t" /* srcData */\
> -"add$16, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -"test %%"FF_REG_S", %%"FF_REG_S"
>  \n\t"\
> -"pmulhw   %%xmm0, %%xmm2  \n\t"\
> -"pmulhw   %%xmm0, %%xmm5  \n\t"\
> -"paddw%%xmm2, %%xmm3  \n\t"\
> -"paddw%%xmm5, %%xmm4  \n\t"\
> -" jnz1b \n\t"\
> -"psraw   $3, %%xmm3  \n\t"\
> -"psraw   $3, %%xmm4  \n\t"\
> -"packuswb %%xmm4, %%xmm3  \n\t"\
> -"movntdq  %%xmm3, (%1, %%"FF_REG_c")
> \n\t"

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-10-27 Thread Alan Kelly

---
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 -
 libswscale/x86/yuv2yuvX.asm | 109 
 3 files changed, 120 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..fec9fa22e0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-10-27 Thread Alan Kelly

Thanks for the feedback Anton.

The second patch incorporates changes suggested by James Almer:
- avx2 instructions are wrapped in %if cpuflag(avx2), and movddup is restored
- mm1 is replaced by m1 on x86_32



On Tue, Oct 27, 2020 at 10:40 AM Anton Khirnov  wrote:

> Hi,
> Quoting Alan Kelly (2020-10-27 10:10:14)
> > ---
> >  libswscale/x86/Makefile |   1 +
> >  libswscale/x86/swscale.c|  75 -
> >  libswscale/x86/yuv2yuvX.asm | 109 
> >  3 files changed, 120 insertions(+), 65 deletions(-)
> >  create mode 100644 libswscale/x86/yuv2yuvX.asm
> >
>
> No comments on the code itself (yet?), but as for your submission:
> - when you send multiple iterations of the same patch, it is helpful to
>   mention what changed, e.g. with git send-email --annotate
> - the commit message should follow the standard format of:
> * swscale: short summary of the change
>
>   Extended description of the commit, if needed.
>
> --
> Anton Khirnov
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-11-06 Thread Alan Kelly

The function was rewritten in asm. This code is heavily derived from the
original code: the algorithm remains unchanged, but the implementation is
optimized. Would you agree to adding the copyright from swscale.c:
* Copyright (C) 2001-2011 Michael Niedermayer 
to this file, having both copyrights?  Thank you.

On Sat, Oct 31, 2020 at 1:02 PM Carl Eugen Hoyos  wrote:

> Am Di., 27. Okt. 2020 um 09:56 Uhr schrieb Alan Kelly
> :
>
> > --- /dev/null
> > +++ b/libswscale/x86/yuv2yuvX.asm
> > @@ -0,0 +1,105 @@
> >
> +;**
> > +;* x86-optimized yuv2yuvX
> > +;* Copyright 2020 Google LLC
>
> Either the commit message ("move a function") or this
> copyright statement is wrong, please fix this.
>
> Please do not commit as-is...
>
> Carl Eugen
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-11-10 Thread Alan Kelly

---
 yuv2yuvX.asm: Ports yuv2yuvX to asm, unrolls main loop and adds
 other small optimizations for ~20% speed-up. Copyright updated to
 include the original from swscale.c
 swscale.c: Removes yuv2yuvX_sse3 and calls new function ff_yuv2yuvX_sse3.
 Calls yuv2yuvX_mmxext on remaining elements if required.
 Makefile: Compiles yuv2yuvX.asm

 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 
 libswscale/x86/yuv2yuvX.asm | 110 
 3 files changed, 121 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..fec9fa22e0 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, int filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-11-12 Thread Alan Kelly

---
 It now works on x86-32
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 
 libswscale/x86/yuv2yuvX.asm | 110 
 3 files changed, 121 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..758c8e540f 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-11-16 Thread Alan Kelly

---
 Fixes bug in sse3 path where m1 is not set correctly resulting in off
 by one errors. The results are now bit by bit identical.
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 
 libswscale/x86/yuv2yuvX.asm | 114 
 3 files changed, 125 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..758c8e540f 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} el

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-11-19 Thread Alan Kelly

---
 All of Henrik's suggestions have been implemented. Additionally,
 m3 and m6 are permuted in avx2 before storing to ensure bit by bit
 identical results in avx2.
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c|  75 +++
 libswscale/x86/yuv2yuvX.asm | 118 
 3 files changed, 129 insertions(+), 65 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..758c8e540f 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
+void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize,
+   uint8_t *dest, int dstW,
+   const uint8_t *dither, int offset);
+
 static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
 {
+int remainder = (dstW % 32);
+int pixelsProcessed = dstW - remainder;
 if(((uintptr_t)dest) & 15){
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
 return;
 }
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_R

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-12-01 Thread Alan Kelly

Ping

On Thu, Nov 19, 2020 at 9:42 AM Alan Kelly  wrote:

> ---
>  All of Henrik's suggestions have been implemented. Additionally,
>  m3 and m6 are permuted in avx2 before storing to ensure bit by bit
>  identical results in avx2.
>  libswscale/x86/Makefile |   1 +
>  libswscale/x86/swscale.c|  75 +++
>  libswscale/x86/yuv2yuvX.asm | 118 
>  3 files changed, 129 insertions(+), 65 deletions(-)
>  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 831d5359aa..bfe383364e 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
> \
> x86/scale.o  \
> x86/rgb_2_rgb.o  \
> x86/yuv_2_rgb.o  \
> +   x86/yuv2yuvX.o   \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..758c8e540f 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -197,80 +197,25 @@ void ff_updateMMXDitherTables(SwsContext *c, int
> dstY)
>  }
>
>  #if HAVE_MMXEXT
> +void ff_yuv2yuvX_sse3(const int16_t *filter, long filterSize,
> +   uint8_t *dest, int dstW,
> +   const uint8_t *dither, int offset);
> +
>  static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> const int16_t **src, uint8_t *dest, int dstW,
> const uint8_t *dither, int offset)
>  {
> +int remainder = (dstW % 32);
> +int pixelsProcessed = dstW - remainder;
>  if(((uintptr_t)dest) & 15){
>  yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset);
>  return;
>  }
> -filterSize--;
> -#define MAIN_FUNCTION \
> -"pxor   %%xmm0, %%xmm0 \n\t" \
> -"punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -"movd   %4, %%xmm1 \n\t" \
> -"punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -"punpckldq  %%xmm1, %%xmm1 \n\t" \
> -"punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -"psllw  $3, %%xmm1 \n\t" \
> -"paddw  %%xmm1, %%xmm3 \n\t" \
> -"psraw  $4, %%xmm3 \n\t" \
> -"movdqa %%xmm3, %%xmm4 \n\t" \
> -"movdqa %%xmm3, %%xmm7 \n\t" \
> -"movl   %3, %%ecx  \n\t" \
> -"mov %0, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -".p2align 4 \n\t" /*
> FIXME Unroll? */\
> -"1: \n\t"\
> -"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /*
> filterCoeff */\
> -"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2
> \n\t" /* srcData */\
> -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5
> \n\t" /* srcData */\
> -"add$16, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -"test %%"FF_REG_S", %%"FF_REG_S"
>  \n\t"\
> -"pmulhw   %%xmm0, %%xmm2  \n\t"\
> -"pmulhw   %%xmm0, %%xmm5  \n\t"\
> -"paddw%%xmm2, %%xmm3  \n\t"\
> -"paddw%%xmm5, %%xmm4  \n\t"\
> -" jnz1b \n\t"\
> -"psraw   $3, %%xmm3  \n\t"\
> -"psraw   $3, %%xmm4  \n\t"\
> -"packuswb %%xmm4, %%xmm3  \n\t"\
> -"movntdq  %%xmm3, (%1, %%"FF_REG_c")
> \n\t"\
> -"add $16, %%"FF_REG_c"\n\t"\
> -"cmp  %2, %%"FF_REG_c"\n\t"\
> -"mo

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-12-09 Thread Alan Kelly

---
 Activates avx2 version of yuv2yuvX
 Adds checkasm for yuv2yuvX
 Modifies ff_yuv2yuvX_* signature to match yuv2yuvX_*
 Replaces non-temporal stores with temporal stores
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c| 106 +---
 libswscale/x86/yuv2yuvX.asm | 118 
 tests/checkasm/sw_scale.c   | 101 +-
 4 files changed, 249 insertions(+), 77 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..8cd8713705 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-12-09 Thread Alan Kelly

This function is tested by fate-filter-fps-r. I have also added a checkasm
test and bench.

I have done a lot more testing and benching of this code and I am now happy
to activate the avx2 version because the performance is so good. On my
machine I get the following results for filter size 4 and 0 offset. For all
other sizes/offsets the results are similar:

yuv2yuvX_4_0_mmx:
1567.2 1563.1

yuv2yuvX_4_0_mmxext:
1560.7 1560.1

yuv2yuvX_4_0_sse3:
780.7 572.1 -26.7%

yuv2yuvX_4_0_avx2:
n/a 341.1 -56.3%

Interestingly, I discovered that the non-temporal store movntdq results in
very large variability in the test results; in many cases it significantly
increases the execution time. I have replaced these stores with aligned
stores, which stabilised the runtimes. However, I am aware that
benchmarks often don't represent reality and these non-temporal stores were
probably used for a good reason. If you think it better to use NT stores, I
will restore them.

On Fri, Dec 4, 2020 at 2:00 PM Anton Khirnov  wrote:

> Quoting Alan Kelly (2020-11-19 09:41:56)
> > ---
> >  All of Henrik's suggestions have been implemented. Additionally,
> >  m3 and m6 are permuted in avx2 before storing to ensure bit by bit
> >  identical results in avx2.
> >  libswscale/x86/Makefile |   1 +
> >  libswscale/x86/swscale.c|  75 +++
> >  libswscale/x86/yuv2yuvX.asm | 118 
> >  3 files changed, 129 insertions(+), 65 deletions(-)
> >  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> Is this function tested by FATE?
> I did some brief testing and apparently it gets called during
> fate-filter-shuffleplanes-dup-luma, but the results do not change even
> if I comment out the whole function.
>
> Also, it seems like you are adding an AVX2 version of the function, but
> I don't see it being used.
>
> --
> Anton Khirnov
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-12-10 Thread Alan Kelly

---
 Replaces ff_sws_init_swscale_x86 with ff_getSwsFunc
 Load offset is not gprsize but 8 on both 32 and 64 bit
 Removes sfence as NT store no longer used
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c| 106 +---
 libswscale/x86/yuv2yuvX.asm | 117 
 tests/checkasm/sw_scale.c   | 101 ++-
 4 files changed, 248 insertions(+), 77 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..8cd8713705 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2020-12-17 Thread Alan Kelly

---
 Fixes memory alignment problem in checkasm-sw_scale
 Tested on Linux 32 and 64 bit and mingw32
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c| 106 +---
 libswscale/x86/yuv2yuvX.asm | 117 
 tests/checkasm/sw_scale.c   |  98 ++
 4 files changed, 246 insertions(+), 76 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..8cd8713705 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-05 Thread Alan Kelly

Ping!

On Thu, Dec 17, 2020 at 11:42 AM Alan Kelly  wrote:

> ---
>  Fixes memory alignment problem in checkasm-sw_scale
>  Tested on Linux 32 and 64 bit and mingw32
>  libswscale/x86/Makefile |   1 +
>  libswscale/x86/swscale.c| 106 +---
>  libswscale/x86/yuv2yuvX.asm | 117 
>  tests/checkasm/sw_scale.c   |  98 ++
>  4 files changed, 246 insertions(+), 76 deletions(-)
>  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
> index 831d5359aa..bfe383364e 100644
> --- a/libswscale/x86/Makefile
> +++ b/libswscale/x86/Makefile
> @@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
> \
> x86/scale.o  \
> x86/rgb_2_rgb.o  \
> x86/yuv_2_rgb.o  \
> +   x86/yuv2yuvX.o   \
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 3160fedf04..8cd8713705 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int
> dstY)
>  }
>
>  #if HAVE_MMXEXT
> -static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
> -   const int16_t **src, uint8_t *dest, int dstW,
> -   const uint8_t *dither, int offset)
> -{
> -if(((uintptr_t)dest) & 15){
> -yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset);
> -return;
> -}
> -filterSize--;
> -#define MAIN_FUNCTION \
> -"pxor   %%xmm0, %%xmm0 \n\t" \
> -"punpcklbw  %%xmm0, %%xmm3 \n\t" \
> -"movd   %4, %%xmm1 \n\t" \
> -"punpcklwd  %%xmm1, %%xmm1 \n\t" \
> -"punpckldq  %%xmm1, %%xmm1 \n\t" \
> -"punpcklqdq %%xmm1, %%xmm1 \n\t" \
> -"psllw  $3, %%xmm1 \n\t" \
> -"paddw  %%xmm1, %%xmm3 \n\t" \
> -"psraw  $4, %%xmm3 \n\t" \
> -"movdqa %%xmm3, %%xmm4 \n\t" \
> -"movdqa %%xmm3, %%xmm7 \n\t" \
> -"movl   %3, %%ecx  \n\t" \
> -"mov %0, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -".p2align 4 \n\t" /*
> FIXME Unroll? */\
> -"1: \n\t"\
> -"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /*
> filterCoeff */\
> -"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2
> \n\t" /* srcData */\
> -"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5
> \n\t" /* srcData */\
> -"add$16, %%"FF_REG_d"
> \n\t"\
> -"mov(%%"FF_REG_d"), %%"FF_REG_S"
>  \n\t"\
> -"test %%"FF_REG_S", %%"FF_REG_S"
>  \n\t"\
> -"pmulhw   %%xmm0, %%xmm2  \n\t"\
> -"pmulhw   %%xmm0, %%xmm5  \n\t"\
> -"paddw%%xmm2, %%xmm3  \n\t"\
> -"paddw%%xmm5, %%xmm4  \n\t"\
> -" jnz1b \n\t"\
> -"psraw   $3, %%xmm3  \n\t"\
> -"psraw   $3, %%xmm4  \n\t"\
> -"packuswb %%xmm4, %%xmm3  \n\t"\
> -"movntdq  %%xmm3, (%1, %%"FF_REG_c")
> \n\t"\
> -"add $16, %%"FF_REG_c"\n\t"\
> -"cmp  %2, %%"FF_REG_c"\n\t"\
> -"movdqa   %%xmm7, %%xmm3\n\t" \
> -"movdqa   %%xmm7, %%xmm4\n\t" \
> -"mov %0, %%"FF_REG_d"
> \n\t"\
> -&q

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-07 Thread Alan Kelly

---
 Replaces mova with movdqu due to alignment issues
 libswscale/x86/Makefile |   1 +
 libswscale/x86/swscale.c| 106 +---
 libswscale/x86/yuv2yuvX.asm | 117 
 tests/checkasm/sw_scale.c   |  98 ++
 4 files changed, 246 insertions(+), 76 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 3160fedf04..8cd8713705 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -197,81 +197,30 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3, %%xmm4  \n\t"
-"psrlq$24, %%xmm3  \n\t"
-"psllq$40, %%xmm4  \n\t"
-"por   %%xmm4, %%xmm3  \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *) dither)[0])
-  : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , 
"%xmm5" , "%xmm7" ,)
-"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c
-  );
-} else {
-__asm__ volatile(
-"movq  %5, %%xmm3   \n\t"
-MAIN_FUNCTION
-  :: "g" (filter),
-  "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
-  "m"(filterSize), "m"(((uint64_t *)

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-07 Thread Alan Kelly

Thanks for your patience with this. I have replaced mova with movdqu - movu
generated a compile error on ssse3. What system did this crash on?

On Wed, Jan 6, 2021 at 9:10 PM Michael Niedermayer 
wrote:

> On Tue, Jan 05, 2021 at 01:31:25PM +0100, Alan Kelly wrote:
> > Ping!
>
> crashes (due to alignment i think)
>
> (gdb) disassemble $rip-32,$rip+32
> Dump of assembler code from 0x555730a1 to 0x555730e1:
>0x555730a1 :   int$0x71
>0x555730a3 :   out%al,$0x3
>0x555730a5 :   vpsraw $0x3,%ymm1,%ymm1
>0x555730aa :   vpackuswb %ymm4,%ymm3,%ymm3
>0x555730ae :   vpackuswb %ymm1,%ymm6,%ymm6
>0x555730b2 :   mov(%rdi),%rdx
>0x555730b5 :   vpermq $0xd8,%ymm3,%ymm3
>0x555730bb :   vpermq $0xd8,%ymm6,%ymm6
> => 0x555730c1 :   vmovdqa %ymm3,(%rcx,%rax,1)
>0x555730c6 :   vmovdqa
> %ymm6,0x20(%rcx,%rax,1)
>0x555730cc :   add$0x40,%rax
>0x555730d0 :   mov%rdi,%rsi
>0x555730d3 :   cmp%r8,%rax
>0x555730d6 :   jb 0x5557304d
> 
>0x555730dc :   vzeroupper
>0x555730df :   retq
>0x555730e0 : push   %r15
> End of assembler dump.
> (gdb) info all-registers
> rax0x0  0
> rbx0x0  0
> rcx0x5583f470   93824995292272
>
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Modern terrorism, a quick summary: Need oil, start war with country that
> has oil, kill hundread thousand in war. Let country fall into chaos,
> be surprised about raise of fundamantalists. Drop more bombs, kill more
> people, be surprised about them taking revenge and drop even more bombs
> and strip your own citizens of their rights and freedoms. to be continued
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-11 Thread Alan Kelly

It's a bug in the patch. The tail not processed by the sse3/avx2 version is
done by the mmx version. I used offset to account for the src pixels
already processed; however, dither is modified if offset is not 0. In cases
where there is a tail and offset is 0, this bug appears. I am working on a
solution.

On Sun, Jan 10, 2021 at 4:26 PM Michael Niedermayer 
wrote:

> On Thu, Jan 07, 2021 at 10:41:19AM +0100, Alan Kelly wrote:
> > ---
> >  Replaces mova with movdqu due to alignment issues
> >  libswscale/x86/Makefile |   1 +
> >  libswscale/x86/swscale.c| 106 +---
> >  libswscale/x86/yuv2yuvX.asm | 117 
> >  tests/checkasm/sw_scale.c   |  98 ++
> >  4 files changed, 246 insertions(+), 76 deletions(-)
> >  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> I have one / some ? cases where this changes output
>  ./ffmpeg -i utvideo-yuv422p10le_UQY2_crc32-A431CD5F.avi -bitexact avi.avi
>
>  i dont know if theres a decoder bug or bug in the patch or something else
>
> -rw-r- 1 michael michael 246218 Jan 10 16:23 avi.avi
> -rw-r- 1 michael michael 245824 Jan 10 16:23 avi-ref.avi
>
> file should be at:
> https://samples.ffmpeg.org/ffmpeg-bugs/trac/ticket4044/
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> In a rich man's house there is no place to spit but his face.
> -- Diogenes of Sinope
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-11 Thread Alan Kelly

---
 Fixes a bug where, if there is no offset and there is a tail which is not
 processed by the sse3/avx2 version, the dither is modified
 Deletes mmx/mmxext yuv2yuvX version from swscale_template and adds it
 to yuv2yuvX.asm to reduce code duplication and so that it may be used
 to process the tail from the larger cardinal simd versions.
 src argument of yuv2yuvX_* is now srcOffset, so that tails and offsets
 are accounted for correctly.
 Changes input size in checkasm so that this corner case is tested.

 libswscale/x86/Makefile   |   1 +
 libswscale/x86/swscale.c  | 130 
 libswscale/x86/swscale_template.c |  82 --
 libswscale/x86/yuv2yuvX.asm   | 136 ++
 tests/checkasm/sw_scale.c | 100 ++
 5 files changed, 291 insertions(+), 158 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 15c0b22f20..3df193a067 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 
0x8080808080808080ULL;
 DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 
0x0001000100010001ULL;
 
 
+#define YUV2YUVX_FUNC_DECL(opt)  \
+static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const 
int16_t **src, \
+   uint8_t *dest, int dstW, \
+   const uint8_t *dither, int offset); \
+
+YUV2YUVX_FUNC_DECL(mmx)
+YUV2YUVX_FUNC_DECL(mmxext)
+YUV2YUVX_FUNC_DECL(sse3)
+YUV2YUVX_FUNC_DECL(avx2)
+
 //MMX versions
 #if HAVE_MMX_INLINE
 #undef RENAME
@@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov

Re: [FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-14 Thread Alan Kelly

Apologies for this: when I added mmx to the yasm file, I added a macro for
the stores, selecting mova for mmx and movdqu for the others. However,
"if cpuflag(mmx)" evaluates to true for all architectures, so I replaced it
with "if notcpuflag(sse3)".

The alignment in the checkasm test has been changed to 8 from 32 so that
the test catches problems with alignment.

On Thu, Jan 14, 2021 at 1:11 AM Michael Niedermayer 
wrote:

> On Mon, Jan 11, 2021 at 05:46:31PM +0100, Alan Kelly wrote:
> > ---
> >  Fixes a bug where if there is no offset and a tail which is not
> processed by the
> >  sse3/avx2 version the dither is modified
> >  Deletes mmx/mmxext yuv2yuvX version from swscale_template and adds it
> >  to yuv2yuvX.asm to reduce code duplication and so that it may be used
> >  to process the tail from the larger cardinal simd versions.
> >  src argument of yuv2yuvX_* is now srcOffset, so that tails and offsets
> >  are accounted for correctly.
> >  Changes input size in checkasm so that this corner case is tested.
> >
> >  libswscale/x86/Makefile   |   1 +
> >  libswscale/x86/swscale.c  | 130 
> >  libswscale/x86/swscale_template.c |  82 --
> >  libswscale/x86/yuv2yuvX.asm   | 136 ++
> >  tests/checkasm/sw_scale.c | 100 ++
> >  5 files changed, 291 insertions(+), 158 deletions(-)
> >  create mode 100644 libswscale/x86/yuv2yuvX.asm
>
> This seems to be crashing again unless i messed up testing
>
> (gdb) disassemble $rip-32,$rip+32
> Dump of assembler code from 0x55572f02 to 0x55572f42:
>0x55572f02 :   int$0x71
>0x55572f04 :   out%al,$0x3
>0x55572f06 :   vpsraw $0x3,%ymm1,%ymm1
>0x55572f0b :   vpackuswb %ymm4,%ymm3,%ymm3
>0x55572f0f :   vpackuswb %ymm1,%ymm6,%ymm6
>0x55572f13 :   mov(%rdi),%rdx
>0x55572f16 :   vpermq $0xd8,%ymm3,%ymm3
>0x55572f1c :   vpermq $0xd8,%ymm6,%ymm6
> => 0x55572f22 :   vmovdqa %ymm3,(%rcx,%rax,1)
>0x55572f27 :   vmovdqa
> %ymm6,0x20(%rcx,%rax,1)
>0x55572f2d :   add$0x40,%rax
>0x55572f31 :   mov%rdi,%rsi
>0x55572f34 :   cmp%r8,%rax
>0x55572f37 :   jb 0x55572eae
> 
>0x55572f3d :   vzeroupper
>0x55572f40 :   retq
>0x55572f41 :   nopw   %cs:0x0(%rax,%rax,1)
>
> rax0x0  0
> rbx0x30 48
> rcx0x5583f470   93824995292272
> rdx0x5585e500   93824995419392
>
> #0  0x55572f22 in ff_yuv2yuvX_avx2 ()
> #1  0x555724ee in yuv2yuvX_avx2 ()
> #2  0x5556b4f6 in chr_planar_vscale ()
> #3  0x55566d41 in swscale ()
> #4  0x55568284 in sws_scale ()
>
>
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> What does censorship reveal? It reveals fear. -- Julian Assange
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] Moves yuv2yuvX_sse3 to yasm, unrolls main loop and other small optimizations for ~20% speedup.

2021-01-14 Thread Alan Kelly

---
 Replaces cpuflag(mmx) with notcpuflag(sse3) for store macro
 Tests for multiple sizes in checkasm-sw_scale
 checkasm-sw_scale aligns memory on 8 bytes instead of 32 to catch aligned loads
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/swscale.c  | 130 
 libswscale/x86/swscale_template.c |  82 --
 libswscale/x86/yuv2yuvX.asm   | 136 ++
 tests/checkasm/sw_scale.c | 103 ++
 5 files changed, 294 insertions(+), 158 deletions(-)
 create mode 100644 libswscale/x86/yuv2yuvX.asm

diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index 831d5359aa..bfe383364e 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -13,3 +13,4 @@ X86ASM-OBJS += x86/input.o
  \
x86/scale.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
+   x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 15c0b22f20..3df193a067 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -63,6 +63,16 @@ DECLARE_ASM_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 
0x8080808080808080ULL;
 DECLARE_ASM_ALIGNED(8, const uint64_t, ff_w)= 
0x0001000100010001ULL;
 
 
+#define YUV2YUVX_FUNC_DECL(opt)  \
+static void yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, const 
int16_t **src, \
+   uint8_t *dest, int dstW, \
+   const uint8_t *dither, int offset); \
+
+YUV2YUVX_FUNC_DECL(mmx)
+YUV2YUVX_FUNC_DECL(mmxext)
+YUV2YUVX_FUNC_DECL(sse3)
+YUV2YUVX_FUNC_DECL(avx2)
+
 //MMX versions
 #if HAVE_MMX_INLINE
 #undef RENAME
@@ -198,81 +208,44 @@ void ff_updateMMXDitherTables(SwsContext *c, int dstY)
 }
 
 #if HAVE_MMXEXT
-static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
-   const int16_t **src, uint8_t *dest, int dstW,
-   const uint8_t *dither, int offset)
-{
-if(((uintptr_t)dest) & 15){
-yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
-return;
-}
-filterSize--;
-#define MAIN_FUNCTION \
-"pxor   %%xmm0, %%xmm0 \n\t" \
-"punpcklbw  %%xmm0, %%xmm3 \n\t" \
-"movd   %4, %%xmm1 \n\t" \
-"punpcklwd  %%xmm1, %%xmm1 \n\t" \
-"punpckldq  %%xmm1, %%xmm1 \n\t" \
-"punpcklqdq %%xmm1, %%xmm1 \n\t" \
-"psllw  $3, %%xmm1 \n\t" \
-"paddw  %%xmm1, %%xmm3 \n\t" \
-"psraw  $4, %%xmm3 \n\t" \
-"movdqa %%xmm3, %%xmm4 \n\t" \
-"movdqa %%xmm3, %%xmm7 \n\t" \
-"movl   %3, %%ecx  \n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-".p2align 4 \n\t" /* FIXME 
Unroll? */\
-"1: \n\t"\
-"movddup  8(%%"FF_REG_d"), %%xmm0   \n\t" /* 
filterCoeff */\
-"movdqa  (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* 
srcData */\
-"movdqa16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* 
srcData */\
-"add$16, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
-"pmulhw   %%xmm0, %%xmm2  \n\t"\
-"pmulhw   %%xmm0, %%xmm5  \n\t"\
-"paddw%%xmm2, %%xmm3  \n\t"\
-"paddw%%xmm5, %%xmm4  \n\t"\
-" jnz1b \n\t"\
-"psraw   $3, %%xmm3  \n\t"\
-"psraw   $3, %%xmm4  \n\t"\
-"packuswb %%xmm4, %%xmm3  \n\t"\
-"movntdq  %%xmm3, (%1, %%"FF_REG_c") \n\t"\
-"add $16, %%"FF_REG_c"\n\t"\
-"cmp  %2, %%"FF_REG_c"\n\t"\
-"movdqa   %%xmm7, %%xmm3\n\t" \
-"movdqa   %%xmm7, %%xmm4\n\t" \
-"mov %0, %%"FF_REG_d"\n\t"\
-"mov(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
-"jb  1b \n\t"
-
-if (offset) {
-__asm__ volatile(
-"movq  %5, %%xmm3  \n\t"
-"movdqa%%xmm3

[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-12-14 Thread Alan Kelly

Patch has been rebased from latest commits.
These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 +++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 ++
 tests/checkasm/sw_scale.c |  20 --
 6 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 708facba67..64aa0b9804 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ae92ac9fbc..d4a72d3ce1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 0 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o  \
+   x86/scale_avx2.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 00..d90fd2d791
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@
+;**

Re: [FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-12-15 Thread Alan Kelly

On Tue, Dec 14, 2021 at 6:07 PM James Almer  wrote:

> On 12/14/2021 12:23 PM, Alan Kelly wrote:
> > Patch has been rebased from latest commits.
> > These functions replace all ff_hscale8to15_*_ssse3 when avx2 is
> available.
> > ---
> >   libswscale/swscale_internal.h |   2 +
> >   libswscale/utils.c|  37 +++
> >   libswscale/x86/Makefile   |   1 +
> >   libswscale/x86/scale_avx2.asm | 112 ++
> >   libswscale/x86/swscale.c  |  19 ++
> >   tests/checkasm/sw_scale.c |  20 --
> >   6 files changed, 186 insertions(+), 5 deletions(-)
> >   create mode 100644 libswscale/x86/scale_avx2.asm
> >
> > diff --git a/libswscale/swscale_internal.h
> b/libswscale/swscale_internal.h
> > index 708facba67..64aa0b9804 100644
> > --- a/libswscale/swscale_internal.h
> > +++ b/libswscale/swscale_internal.h
> > @@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr,
> int threadnr,
> >   //number of extra lines to process
> >   #define MAX_LINES_AHEAD 4
> >
> > +//shuffle filter and filterPos for hyScale and hcScale filters in avx2
> > +void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
> >   #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> > diff --git a/libswscale/utils.c b/libswscale/utils.c
> > index ae92ac9fbc..d4a72d3ce1 100644
> > --- a/libswscale/utils.c
> > +++ b/libswscale/utils.c
> > @@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = {
> >   [AV_PIX_FMT_P416LE]  = { 1, 0 },
> >   };
> >
> > +void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
> filterSize, int16_t *filter, int dstW){
> > +#if ARCH_X86_64
> > +int i, j, k, l;
> > +int cpu_flags = av_get_cpu_flags();
> > +if (EXTERNAL_AVX2_FAST(cpu_flags)){
> > +if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> > +if (dstW % 16 == 0){
> > +if (filter != NULL){
> > +for (i = 0; i < dstW; i += 8){
> > +FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
> > +FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
> > +}
> > +if (filterSize > 4){
> > +int16_t *tmp2 = av_malloc(dstW * filterSize *
> 2);
> > +memcpy(tmp2, filter, dstW * filterSize * 2);
> > +for (i = 0; i < dstW; i += 16){//pixel
> > +for (k = 0; k < filterSize / 4;
> ++k){//fcoeff
> > +for (j = 0; j < 16; ++j){//inner pixel
> > +for (l = 0; l < 4; ++l){//coeff
> > +int from = i * filterSize + j *
> filterSize + k * 4 + l;
> > +int to = (i) * filterSize + j *
> 4 + l + k * 64;
> > +filter[to] = tmp2[from];
> > +}
> > +}
> > +}
> > +}
> > +av_free(tmp2);
> > +}
> > +}
> > +}
> > +}
> > +}
> > +#endif
> > +}
> > +
> >   int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
> >   {
> >   return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
> > @@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
> >  get_local_pos(c, 0, 0, 0),
> >  get_local_pos(c, 0, 0, 0))) < 0)
> >   goto fail;
> > +ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
> c->hLumFilterSize, c->hLumFilter, dstW);
> >   if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
> >  &c->hChrFilterSize, c->chrXInc,
> >  c->chrSrcW, c->chrDstW, filterAlign, 1 <<
> 14,
> > @@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
> >  get_local_pos(c, c->chrSrcHSubSample,
> c->src_h_chr_pos, 0),
> >  get_local_pos(c, c->chrDstHSubSample,
> c->dst_h_chr_pos, 0))) < 0)
> >   goto fail;
> > +ff_sh

[FFmpeg-devel] [PATCH 2/2] libswscale: Adds ff_hscale8to15_4_avx2 and ff_hscale8to15_X4_avx2 for all filter sizes.

2021-12-15 Thread Alan Kelly

Fixes so that fate under 64 bit Windows passes.

These functions replace all ff_hscale8to15_*_ssse3 when avx2 is available.
---
 libswscale/swscale_internal.h |   2 +
 libswscale/utils.c|  37 +++
 libswscale/x86/Makefile   |   1 +
 libswscale/x86/scale_avx2.asm | 112 ++
 libswscale/x86/swscale.c  |  19 ++
 tests/checkasm/sw_scale.c |  20 --
 6 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 libswscale/x86/scale_avx2.asm

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 708facba67..64aa0b9804 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1105,4 +1105,6 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 //number of extra lines to process
 #define MAX_LINES_AHEAD 4
 
+//shuffle filter and filterPos for hyScale and hcScale filters in avx2
+void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index ae92ac9fbc..d4a72d3ce1 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,6 +278,41 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 0 },
 };
 
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+#if ARCH_X86_64
+int i, j, k, l;
+int cpu_flags = av_get_cpu_flags();
+if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
+if (dstW % 16 == 0){
+if (filter != NULL){
+for (i = 0; i < dstW; i += 8){
+FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
+FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
+}
+if (filterSize > 4){
+int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+memcpy(tmp2, filter, dstW * filterSize * 2);
+for (i = 0; i < dstW; i += 16){//pixel
+for (k = 0; k < filterSize / 4; ++k){//fcoeff
+for (j = 0; j < 16; ++j){//inner pixel
+for (l = 0; l < 4; ++l){//coeff
+int from = i * filterSize + j * 
filterSize + k * 4 + l;
+int to = (i) * filterSize + j * 4 + l 
+ k * 64;
+filter[to] = tmp2[from];
+}
+}
+}
+}
+av_free(tmp2);
+}
+}
+}
+}
+}
+#endif
+}
+
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
 {
 return (unsigned)pix_fmt < FF_ARRAY_ELEMS(format_entries) ?
@@ -1801,6 +1836,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1810,6 +1846,7 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
+ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
 }
 } // initialize horizontal stuff
 
diff --git a/libswscale/x86/Makefile b/libswscale/x86/Makefile
index bfe383364e..68391494be 100644
--- a/libswscale/x86/Makefile
+++ b/libswscale/x86/Makefile
@@ -11,6 +11,7 @@ OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 X86ASM-OBJS += x86/input.o  \
x86/output.o \
x86/scale.o  \
+   x86/scale_avx2.o  \
x86/rgb_2_rgb.o  \
x86/yuv_2_rgb.o  \
x86/yuv2yuvX.o   \
diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
new file mode 100644
index 00..4e4fe5d794
--- /dev/null
+++ b/libswscale/x86/scale_avx2.asm
@@ -0,0 +1,112 @@
+;*

[FFmpeg-devel] [PATCH] x86/swscale: fix minor coding style issues

2021-12-16 Thread Alan Kelly

---
 libswscale/x86/swscale.c  | 14 +++---
 tests/checkasm/sw_scale.c |  3 +--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 164b06d6ba..c49a05c37b 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -578,13 +578,13 @@ switch(c->dstBpc){ \
  break; \
 }
 
-if (EXTERNAL_AVX2_FAST(cpu_flags)){
-  if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if(c->chrDstW % 16 == 0)
-  ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if(c->dstW % 16 == 0)
-  ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
-  }
+if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+if (c->chrDstW % 16 == 0)
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+if (c->dstW % 16 == 0)
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+}
 }
 
 if (EXTERNAL_AVX2_FAST(cpu_flags)) {
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 011cb46428..f4912e6c2c 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -217,9 +217,8 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if (cpu_flags & AV_CPU_FLAG_AVX2){
+if (cpu_flags & AV_CPU_FLAG_AVX2)
 ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
-}
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH] x86/swscale: fix minor coding style issues

2021-12-16 Thread Alan Kelly

Thanks Lynne for the patch.

On Thu, Dec 16, 2021 at 5:05 PM Alan Kelly  wrote:

> ---
>  libswscale/x86/swscale.c  | 14 +++---
>  tests/checkasm/sw_scale.c |  3 +--
>  2 files changed, 8 insertions(+), 9 deletions(-)
>
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 164b06d6ba..c49a05c37b 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -578,13 +578,13 @@ switch(c->dstBpc){ \
>   break; \
>  }
>
> -if (EXTERNAL_AVX2_FAST(cpu_flags)){
> -  if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> -if(c->chrDstW % 16 == 0)
> -  ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> -if(c->dstW % 16 == 0)
> -  ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
> -  }
> +if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
> +if (c->chrDstW % 16 == 0)
> +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> +if (c->dstW % 16 == 0)
> +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
> +}
>  }
>
>  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
> index 011cb46428..f4912e6c2c 100644
> --- a/tests/checkasm/sw_scale.c
> +++ b/tests/checkasm/sw_scale.c
> @@ -217,9 +217,8 @@ static void check_hscale(void)
>  }
>  ff_sws_init_scale(ctx);
>  memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS *
> MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
> -if (cpu_flags & AV_CPU_FLAG_AVX2){
> +if (cpu_flags & AV_CPU_FLAG_AVX2)
>  ff_shuffle_filter_coefficients(ctx, filterPosAvx, width,
> filterAvx2, SRC_PIXELS);
> -}
>
>  if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d",
> ctx->srcBpc, ctx->dstBpc + 1, width)) {
>  memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
> --
> 2.34.1.173.g76aa8bc2d0-goog
>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] x86/scale_avx2: Change asm indent from 2 to 4 spaces.

2021-12-16 Thread Alan Kelly

---
 libswscale/x86/scale_avx2.asm | 96 +--
 1 file changed, 48 insertions(+), 48 deletions(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 2cd7e968d3..eb472db12f 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -45,63 +45,63 @@ SECTION .text
 
 %macro SCALE_FUNC 1
 cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, filter, fltpos, 
fltsize, count, inner
-  pxor m0, m0
-  mova m15, [swizzle]
-  mov countq, $0
-  movsxd wq, wd
+pxor m0, m0
+mova m15, [swizzle]
+mov countq, $0
+movsxd wq, wd
 %ifidn %1, X4
-  mova m14, [four]
-  shr fltsized, 2
+mova m14, [four]
+shr fltsized, 2
 %endif
 .loop:
-  movu m1, [fltposq]
-  movu m2, [fltposq+32]
+movu m1, [fltposq]
+movu m2, [fltposq+32]
 %ifidn %1, X4
-  pxor m9, m9
-  pxor m10, m10
-  pxor m11, m11
-  pxor m12, m12
-  mov innerq, $0
+pxor m9, m9
+pxor m10, m10
+pxor m11, m11
+pxor m12, m12
+mov innerq, $0
 .innerloop:
 %endif
-  vpcmpeqd  m13, m13
-  vpgatherdd m3,[srcmemq + m1], m13
-  vpcmpeqd  m13, m13
-  vpgatherdd m4,[srcmemq + m2], m13
-  vpunpcklbw m5, m3, m0
-  vpunpckhbw m6, m3, m0
-  vpunpcklbw m7, m4, m0
-  vpunpckhbw m8, m4, m0
-  vpmaddwd m5, m5, [filterq]
-  vpmaddwd m6, m6, [filterq + 32]
-  vpmaddwd m7, m7, [filterq + 64]
-  vpmaddwd m8, m8, [filterq + 96]
-  add filterq, $80
+vpcmpeqd  m13, m13
+vpgatherdd m3,[srcmemq + m1], m13
+vpcmpeqd  m13, m13
+vpgatherdd m4,[srcmemq + m2], m13
+vpunpcklbw m5, m3, m0
+vpunpckhbw m6, m3, m0
+vpunpcklbw m7, m4, m0
+vpunpckhbw m8, m4, m0
+vpmaddwd m5, m5, [filterq]
+vpmaddwd m6, m6, [filterq + 32]
+vpmaddwd m7, m7, [filterq + 64]
+vpmaddwd m8, m8, [filterq + 96]
+add filterq, $80
 %ifidn %1, X4
-  paddd m9, m5
-  paddd m10, m6
-  paddd m11, m7
-  paddd m12, m8
-  paddd m1, m14
-  paddd m2, m14
-  add innerq, $1
-  cmp innerq, fltsizeq
-  jl .innerloop
-  vphaddd m5, m9, m10
-  vphaddd m6, m11, m12
+paddd m9, m5
+paddd m10, m6
+paddd m11, m7
+paddd m12, m8
+paddd m1, m14
+paddd m2, m14
+add innerq, $1
+cmp innerq, fltsizeq
+jl .innerloop
+vphaddd m5, m9, m10
+vphaddd m6, m11, m12
 %else
-  vphaddd m5, m5, m6
-  vphaddd m6, m7, m8
+vphaddd m5, m5, m6
+vphaddd m6, m7, m8
 %endif
-  vpsrad  m5, 7
-  vpsrad  m6, 7
-  vpackssdw m5, m5, m6
-  vpermd m5, m15, m5
-  vmovdqu [dstq + countq * 2], m5
-  add fltposq, $40
-  add countq, $10
-  cmp countq, wq
-  jl .loop
+vpsrad  m5, 7
+vpsrad  m6, 7
+vpackssdw m5, m5, m6
+vpermd m5, m15, m5
+vmovdqu [dstq + countq * 2], m5
+add fltposq, $40
+add countq, $10
+cmp countq, wq
+jl .loop
 REP_RET
 %endmacro
 
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-20 Thread Alan Kelly

This flag is set on Haswell and earlier and all AMD cpus.
---
 As discussed on IRC last week.
 libavutil/cpu.h | 57 +++--
 libavutil/x86/cpu.c | 13 ++-
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..4272d11d73 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -26,34 +26,35 @@
 #define AV_CPU_FLAG_FORCE0x8000 /* force usage of selected flags (OR) 
*/
 
 /* lower 16 bits - CPU features */
-#define AV_CPU_FLAG_MMX  0x0001 ///< standard MMX
-#define AV_CPU_FLAG_MMXEXT   0x0002 ///< SSE integer functions or AMD MMX 
ext
-#define AV_CPU_FLAG_MMX2 0x0002 ///< SSE integer functions or AMD MMX 
ext
-#define AV_CPU_FLAG_3DNOW0x0004 ///< AMD 3DNOW
-#define AV_CPU_FLAG_SSE  0x0008 ///< SSE functions
-#define AV_CPU_FLAG_SSE2 0x0010 ///< PIV SSE2 functions
-#define AV_CPU_FLAG_SSE2SLOW 0x4000 ///< SSE2 supported, but usually not 
faster
-///< than regular MMX/SSE (e.g. Core1)
-#define AV_CPU_FLAG_3DNOWEXT 0x0020 ///< AMD 3DNowExt
-#define AV_CPU_FLAG_SSE3 0x0040 ///< Prescott SSE3 functions
-#define AV_CPU_FLAG_SSE3SLOW 0x2000 ///< SSE3 supported, but usually not 
faster
-///< than regular MMX/SSE (e.g. Core1)
-#define AV_CPU_FLAG_SSSE30x0080 ///< Conroe SSSE3 functions
-#define AV_CPU_FLAG_SSSE3SLOW 0x400 ///< SSSE3 supported, but usually not 
faster
-#define AV_CPU_FLAG_ATOM 0x1000 ///< Atom processor, some SSSE3 
instructions are slower
-#define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions
-#define AV_CPU_FLAG_SSE420x0200 ///< Nehalem SSE4.2 functions
-#define AV_CPU_FLAG_AESNI   0x8 ///< Advanced Encryption Standard 
functions
-#define AV_CPU_FLAG_AVX  0x4000 ///< AVX functions: requires OS 
support even if YMM registers aren't used
-#define AV_CPU_FLAG_AVXSLOW   0x800 ///< AVX supported, but slow when 
using YMM registers (e.g. Bulldozer)
-#define AV_CPU_FLAG_XOP  0x0400 ///< Bulldozer XOP functions
-#define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions
-#define AV_CPU_FLAG_CMOV 0x1000 ///< supports cmov instruction
-#define AV_CPU_FLAG_AVX2 0x8000 ///< AVX2 functions: requires OS 
support even if YMM registers aren't used
-#define AV_CPU_FLAG_FMA30x1 ///< Haswell FMA3 functions
-#define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
-#define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
-#define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS 
support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_MMX 0x0001 ///< standard MMX
+#define AV_CPU_FLAG_MMXEXT  0x0002 ///< SSE integer functions or AMD 
MMX ext
+#define AV_CPU_FLAG_MMX20x0002 ///< SSE integer functions or AMD 
MMX ext
+#define AV_CPU_FLAG_3DNOW   0x0004 ///< AMD 3DNOW
+#define AV_CPU_FLAG_SSE 0x0008 ///< SSE functions
+#define AV_CPU_FLAG_SSE20x0010 ///< PIV SSE2 functions
+#define AV_CPU_FLAG_SSE2SLOW0x4000 ///< SSE2 supported, but usually 
not faster
+   ///< than regular MMX/SSE (e.g. 
Core1)
+#define AV_CPU_FLAG_3DNOWEXT0x0020 ///< AMD 3DNowExt
+#define AV_CPU_FLAG_SSE30x0040 ///< Prescott SSE3 functions
+#define AV_CPU_FLAG_SSE3SLOW0x2000 ///< SSE3 supported, but usually 
not faster
+   ///< than regular MMX/SSE (e.g. 
Core1)
+#define AV_CPU_FLAG_SSSE3   0x0080 ///< Conroe SSSE3 functions
+#define AV_CPU_FLAG_SSSE3SLOW0x400 ///< SSSE3 supported, but usually 
not faster
+#define AV_CPU_FLAG_ATOM0x1000 ///< Atom processor, some SSSE3 
instructions are slower
+#define AV_CPU_FLAG_SSE40x0100 ///< Penryn SSE4.1 functions
+#define AV_CPU_FLAG_SSE42   0x0200 ///< Nehalem SSE4.2 functions
+#define AV_CPU_FLAG_AESNI  0x8 ///< Advanced Encryption Standard 
functions
+#define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS 
support even if YMM registers aren't used
+#define AV_CPU_FLAG_AVXSLOW  0x800 ///< AVX supported, but slow when 
using YMM registers (e.g. Bulldozer)
+#define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions
+#define AV_CPU_FLAG_FMA40x0800 ///< Bulldozer FMA4 functions
+#define AV_CPU_FLAG_CMOV0x1000 ///< supports cmov instruction
+#define AV_CPU_FLAG_AVX20x8000 ///< AVX2 functions: requires OS 
support even if YMM registers aren't used
+#define AV_CPU_FLAG_FMA3   0x1 ///< Haswell FMA3 functions
+#define AV_CPU_FLAG_BMI1   0x2 ///< Bit Manipulation Instruction 
Set 1
+#define AV_CPU_FLAG_BMI2   0x4 ///< Bit Manipulation

[FFmpeg-devel] [PATCH 2/2] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.

2021-12-20 Thread Alan Kelly

This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions
are only used where they are faster.
---
 libswscale/utils.c| 2 +-
 libswscale/x86/swscale.c  | 2 +-
 tests/checkasm/sw_scale.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index d4a72d3ce1..9a69b45afe 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 #if ARCH_X86_64
 int i, j, k, l;
 int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
 if (dstW % 16 == 0){
 if (filter != NULL){
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index c49a05c37b..eb5334a2be 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -578,7 +578,7 @@ switch(c->dstBpc){ \
  break; \
 }
 
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
 if (c->chrDstW % 16 == 0)
 ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index f4912e6c2c..680562af08 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -217,7 +217,7 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if (cpu_flags & AV_CPU_FLAG_AVX2)
+if (cpu_flags & AV_CPU_FLAG_SLOW_GATHER)
 ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-20 Thread Alan Kelly

This flag is set on Haswell and earlier and all AMD cpus.
---
 Removes unnecessary indentation, clarifies comment and only sets flag on AMD
 cpus with AVX2.
 libavutil/cpu.h |  1 +
 libavutil/x86/cpu.c | 14 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS 
support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x200 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC  0x0001 ///< standard
 #define AV_CPU_FLAG_VSX  0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..563984f234 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) {
 rval |= AV_CPU_FLAG_AVX2;
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+/* Haswell has slow gather */
+if(family == 6 && model < 70)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
@@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
 if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
 rval |= AV_CPU_FLAG_AVXSLOW;
+
+/* AMD cpus have slow gather */
+if(rval & AV_CPU_FLAG_AVX2)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
 }
 
 /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/2] libswscale: Test AV_CPU_FLAG_SLOW_GATHER for hscale functions.

2021-12-20 Thread Alan Kelly

This is instead of EXTERNAL_AVX2_FAST so that the avx2 hscale functions
are only used where they are faster.
---
 Whoops! Corrects check so that this flag is only enabled where fast
 avx2 and fast gathers are available.
 libswscale/utils.c| 2 +-
 libswscale/x86/swscale.c  | 2 +-
 tests/checkasm/sw_scale.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index d4a72d3ce1..7158384f0b 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -282,7 +282,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 #if ARCH_X86_64
 int i, j, k, l;
 int cpu_flags = av_get_cpu_flags();
-if (EXTERNAL_AVX2_FAST(cpu_flags)){
+if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
 if (dstW % 16 == 0){
 if (filter != NULL){
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index c49a05c37b..ffc7691c12 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -578,7 +578,7 @@ switch(c->dstBpc){ \
  break; \
 }
 
-if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
 if (c->chrDstW % 16 == 0)
 ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index f4912e6c2c..3c0a083b42 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -217,7 +217,7 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if (cpu_flags & AV_CPU_FLAG_AVX2)
+if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
 ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-20 Thread Alan Kelly

This flag is set on Haswell and earlier and all AMD cpus.
---
 Sets this flag on Zen 3 and earlier.
 libavutil/cpu.h |  1 +
 libavutil/x86/cpu.c | 14 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS 
support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x200 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC  0x0001 ///< standard
 #define AV_CPU_FLAG_VSX  0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..11467ba99d 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) {
 rval |= AV_CPU_FLAG_AVX2;
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+/* Haswell has slow gather */
+if(family == 6 && model < 70)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
@@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
 if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
 rval |= AV_CPU_FLAG_AVXSLOW;
+
+/* Zen 3 and earlier have slow gather */
+if((rval & AV_CPU_FLAG_AVX2) & family <= 25)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
 }
 
 /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-20 Thread Alan Kelly

On Mon, Dec 20, 2021 at 3:53 PM James Almer  wrote:

>
>
> On 12/20/2021 11:47 AM, Lynne wrote:
> > 20 Dec 2021, 15:43 by alankelly-at-google@ffmpeg.org:
> >
> >> This flag is set on Haswell and earlier and all AMD cpus.
> >> ---
> >>   Removes unnecessary indentation, clarifies comment and only sets flag
> on AMD
> >>   cpus with AVX2.
> >>   libavutil/cpu.h |  1 +
> >>   libavutil/x86/cpu.c | 14 +-
> >>   2 files changed, 14 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/libavutil/cpu.h b/libavutil/cpu.h
> >> index ae443eccad..ce9bf14bf7 100644
> >> --- a/libavutil/cpu.h
> >> +++ b/libavutil/cpu.h
> >> @@ -54,6 +54,7 @@
> >>   #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation
> Instruction Set 1
> >>   #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation
> Instruction Set 2
> >>   #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions:
> requires OS support even if YMM/ZMM registers aren't used
> >> +#define AV_CPU_FLAG_SLOW_GATHER  0x200 ///< CPU has slow gathers.
> >>
> >>   #define AV_CPU_FLAG_ALTIVEC  0x0001 ///< standard
> >>   #define AV_CPU_FLAG_VSX  0x0002 ///< ISA 2.06
> >> diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
> >> index bcd41a50a2..563984f234 100644
> >> --- a/libavutil/x86/cpu.c
> >> +++ b/libavutil/x86/cpu.c
> >> @@ -146,8 +146,16 @@ int ff_get_cpu_flags_x86(void)
> >>   if (max_std_level >= 7) {
> >>   cpuid(7, eax, ebx, ecx, edx);
> >>   #if HAVE_AVX2
> >> -if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
> >> +if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) {
> >>   rval |= AV_CPU_FLAG_AVX2;
> >> +cpuid(1, eax, ebx, ecx, std_caps);
> >> +family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
> >> +model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
> >> +/* Haswell has slow gather */
> >> +if(family == 6 && model < 70)
> >> +rval |= AV_CPU_FLAG_SLOW_GATHER;
> >> +}
> >> +
> >>   #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
> >>   if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
> >>   if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
> >> @@ -196,6 +204,10 @@ int ff_get_cpu_flags_x86(void)
> >>   used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW. */
> >>   if ((family == 0x15 || family == 0x16) && (rval & AV_CPU_FLAG_AVX))
> >>   rval |= AV_CPU_FLAG_AVXSLOW;
> >> +
> >> +/* AMD cpus have slow gather */
> >> +if(rval & AV_CPU_FLAG_AVX2)
> >> +rval |= AV_CPU_FLAG_SLOW_GATHER;
> >>   }
> >>
> >
> > No, I'd rather limit AMD CPUs to all currently released CPUs.
> > Future ones are getting AVX512, which did speed up gathers on
> > Intel CPUs, as the ISA extension extended gathers and added
> > scatters.
>
> I wouldn't hold my breath for that, but it's probably a good idea
> anyway. A check so it's flagged only on Excavator and Zen <= 3.
>
> >
> > Also your previous patch introduces ff_shuffle_filter_coefficients()
> > which is so bad it pretty much needs a complete rewrite.
> > You're also not detecting malloc errors or propagating them back.
>
> That's unrelated to this patch.
>
> >
> > ___
> > ffmpeg-devel mailing list
> > ffmpeg-devel@ffmpeg.org
> > https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
> >
> > To unsubscribe, visit link above, or email
> > ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>

Updated patch sent with check for family <= 25 so that future CPUs will
have avx2 hscale enabled by default.

I may have time this week to look at ff_shuffle_filter_coefficients.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/2] libavutil/cpu: Add AV_CPU_FLAG_SLOW_GATHER.

2021-12-21 Thread Alan Kelly

This flag is set on Haswell and earlier and all AMD cpus.
---
 Checks for family for Haswell. All checks are done where AVX2 flag is
 set as this is clearer.
 libavutil/cpu.h |  1 +
 libavutil/x86/cpu.c | 15 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/libavutil/cpu.h b/libavutil/cpu.h
index ae443eccad..ce9bf14bf7 100644
--- a/libavutil/cpu.h
+++ b/libavutil/cpu.h
@@ -54,6 +54,7 @@
 #define AV_CPU_FLAG_BMI10x2 ///< Bit Manipulation Instruction Set 1
 #define AV_CPU_FLAG_BMI20x4 ///< Bit Manipulation Instruction Set 2
 #define AV_CPU_FLAG_AVX512 0x10 ///< AVX-512 functions: requires OS 
support even if YMM/ZMM registers aren't used
+#define AV_CPU_FLAG_SLOW_GATHER  0x200 ///< CPU has slow gathers.
 
 #define AV_CPU_FLAG_ALTIVEC  0x0001 ///< standard
 #define AV_CPU_FLAG_VSX  0x0002 ///< ISA 2.06
diff --git a/libavutil/x86/cpu.c b/libavutil/x86/cpu.c
index bcd41a50a2..441b4695d5 100644
--- a/libavutil/x86/cpu.c
+++ b/libavutil/x86/cpu.c
@@ -146,8 +146,21 @@ int ff_get_cpu_flags_x86(void)
 if (max_std_level >= 7) {
 cpuid(7, eax, ebx, ecx, edx);
 #if HAVE_AVX2
-if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020))
+if ((rval & AV_CPU_FLAG_AVX) && (ebx & 0x0020)) {
 rval |= AV_CPU_FLAG_AVX2;
+cpuid(1, eax, ebx, ecx, std_caps);
+family = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff);
+model  = ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0);
+/* Haswell has slow gather */
+if (!strncmp(vendor.c, "GenuineIntel", 12))
+if (family == 6 && model < 70)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+/* Zen 3 and earlier have slow gather */
+if (!strncmp(vendor.c, "AuthenticAMD", 12))
+if (family <= 0x19)
+rval |= AV_CPU_FLAG_SLOW_GATHER;
+}
+
 #if HAVE_AVX512 /* F, CD, BW, DQ, VL */
 if ((xcr0_lo & 0xe0) == 0xe0) { /* OPMASK/ZMM state */
 if ((rval & AV_CPU_FLAG_AVX2) && (ebx & 0xd003) == 0xd003)
-- 
2.34.1.173.g76aa8bc2d0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-01-10 Thread Alan Kelly

Make the code more readable, follow the style guide and propagate memory
allocation errors.
---
 libswscale/swscale_internal.h |  2 +-
 libswscale/utils.c| 68 ---
 2 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 3a78d95ba6..26d28d42e6 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 #define MAX_LINES_AHEAD 4
 
 //shuffle filter and filterPos for hyScale and hcScale filters in avx2
-void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
+int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index c5ea8853d5..52f07e1661 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
+   int filterSize, int16_t *filter,
+   int dstW)
+{
 #if ARCH_X86_64
-int i, j, k, l;
+int i = 0, j = 0, k = 0;
 int cpu_flags = av_get_cpu_flags();
+if (!filter || dstW % 16 != 0) return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
-if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if (dstW % 16 == 0){
-if (filter != NULL){
-for (i = 0; i < dstW; i += 8){
-FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
-FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
-}
-if (filterSize > 4){
-int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
-memcpy(tmp2, filter, dstW * filterSize * 2);
-for (i = 0; i < dstW; i += 16){//pixel
-for (k = 0; k < filterSize / 4; ++k){//fcoeff
-for (j = 0; j < 16; ++j){//inner pixel
-for (l = 0; l < 4; ++l){//coeff
-int from = i * filterSize + j * 
filterSize + k * 4 + l;
-int to = (i) * filterSize + j * 4 + l 
+ k * 64;
-filter[to] = tmp2[from];
-}
-}
-}
-}
-av_free(tmp2);
-}
-}
-}
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+   int16_t *filterCopy = NULL;
+   if (filterSize > 4) {
+   if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
+   return AVERROR(ENOMEM);
+   memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t));
+   }
+   // Do not swap filterPos for pixels which won't be processed by
+   // the main loop.
+   for (i = 0; i + 8 <= dstW; i += 8) {
+   FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
+   FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   }
+   if (filterSize > 4) {
+   // 16 pixels are processed at a time.
+   for (i = 0; i + 16 <= dstW; i += 16) {
+   // 4 filter coeffs are processed at a time.
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < 16; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 16;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
+   }
+   if (filterCopy)
+   av_free(filterCopy);
 }
 }
 #endif
+return 0;
 }
 
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
@@ -1836,7 +1844,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
+if ((ret = ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW)) != 0)
+goto nomem;
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,

[FFmpeg-devel] [PATCH 2/4] libswscale: Avx2 hscale can process any input whose size is a multiple of 4.

2022-01-10 Thread Alan Kelly

The main loop processes blocks of 16 pixels. The tail processes blocks
of size 4.
---
 libswscale/x86/scale_avx2.asm | 48 +--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..dc42abb100 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 mova m14, [four]
 shr fltsized, 2
 %endif
+cmp wq, 16
+jl .tail_loop
+mov countq, 0x10
 .loop:
 movu m1, [fltposq]
 movu m2, [fltposq+32]
@@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 vpsrad  m6, 7
 vpackssdw m5, m5, m6
 vpermd m5, m15, m5
-vmovdqu [dstq + countq * 2], m5
+vmovdqu [dstq], m5
+add dstq, 0x20
 add fltposq, 0x40
 add countq, 0x10
 cmp countq, wq
-jl .loop
+jle .loop
+
+sub countq, 0x10
+cmp countq, wq
+jge .end
+
+.tail_loop:
+movu xm1, [fltposq]
+%ifidn %1, X4
+pxor xm9, xm9
+pxor xm10, xm10
+xor innerq, innerq
+.tail_innerloop:
+%endif
+vpcmpeqd  xm13, xm13
+vpgatherdd xm3,[srcmemq + xm1], xm13
+vpunpcklbw xm5, xm3, xm0
+vpunpckhbw xm6, xm3, xm0
+vpmaddwd xm5, xm5, [filterq]
+vpmaddwd xm6, xm6, [filterq + 16]
+add filterq, 0x20
+%ifidn %1, X4
+paddd xm9, xm5
+paddd xm10, xm6
+paddd xm1, xm14
+add innerq, 1
+cmp innerq, fltsizeq
+jl .tail_innerloop
+vphaddd xm5, xm9, xm10
+%else
+vphaddd xm5, xm5, xm6
+%endif
+vpsrad  xm5, 7
+vpackssdw xm5, xm5, xm5
+vmovq [dstq], xm5
+add dstq, 0x8
+add fltposq, 0x10
+add countq, 0x4
+cmp countq, wq
+jl .tail_loop
+.end:
 REP_RET
 %endmacro
 
-- 
2.34.1.575.g55b058a8bb-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/4] libswscale: Enable hscale_avx2 for input sizes which are multiples of 4.

2022-01-10 Thread Alan Kelly

ff_shuffle_filter_coefficients shuffles the tail as required.
---
 libswscale/utils.c   | 17 +++--
 libswscale/x86/swscale.c |  4 ++--
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 52f07e1661..7e1e9c3834 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -285,7 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 #if ARCH_X86_64
 int i = 0, j = 0, k = 0;
 int cpu_flags = av_get_cpu_flags();
-if (!filter || dstW % 16 != 0) return 0;
+if (!filter || (dstW % 4 != 0)) return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
int16_t *filterCopy = NULL;
@@ -296,9 +296,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
// Do not swap filterPos for pixels which won't be processed by
// the main loop.
-   for (i = 0; i + 8 <= dstW; i += 8) {
+   for (i = 0; i + 16 <= dstW; i += 16) {
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
+   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
}
if (filterSize > 4) {
// 16 pixels are processed at a time.
@@ -312,6 +314,17 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
}
}
+   // 4 pixels are processed at a time in the tail.
+   for (; i + 4 <= dstW; i += 4) {
+   // 4 filter coeffs are processed at a time.
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < 4; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 4;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
}
if (filterCopy)
av_free(filterCopy);
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index fdc93866a6..1d8f19aa5a 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -580,9 +580,9 @@ switch(c->dstBpc){ \
 
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
-if (c->chrDstW % 16 == 0)
+if (c->chrDstW % 4 == 0)
 ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if (c->dstW % 16 == 0)
+if (c->dstW % 4 == 0)
 ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
 }
 }
-- 
2.34.1.575.g55b058a8bb-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 4/4] checkasm/sw_scale: hscale does not require cpuflag test.

2022-01-10 Thread Alan Kelly

This is done in ff_shuffle_filter_coefficients.
---
 tests/checkasm/sw_scale.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 3c0a083b42..e7f916d3a8 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -168,8 +168,6 @@ static void check_hscale(void)
   const uint8_t *src, const int16_t *filter,
   const int32_t *filterPos, int filterSize);
 
-int cpu_flags = av_get_cpu_flags();
-
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
@@ -215,10 +213,10 @@ static void check_hscale(void)
 
 filter[SRC_PIXELS * width + i] = rnd();
 }
+
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.34.1.575.g55b058a8bb-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-02-02 Thread Alan Kelly

Hi,

Is anybody interested in this patch set?

Thanks!

On Mon, Jan 10, 2022, 15:58 Alan Kelly  wrote:

> Make the code more readable, follow the style guide and propagate memory
> allocation errors.
> ---
>  libswscale/swscale_internal.h |  2 +-
>  libswscale/utils.c| 68 ---
>  2 files changed, 40 insertions(+), 30 deletions(-)
>
> diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
> index 3a78d95ba6..26d28d42e6 100644
> --- a/libswscale/swscale_internal.h
> +++ b/libswscale/swscale_internal.h
> @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int
> threadnr,
>  #define MAX_LINES_AHEAD 4
>
>  //shuffle filter and filterPos for hyScale and hcScale filters in avx2
> -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
> +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
>  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index c5ea8853d5..52f07e1661 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = {
>  [AV_PIX_FMT_P416LE]  = { 1, 1 },
>  };
>
> -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
> filterSize, int16_t *filter, int dstW){
> +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
> +   int filterSize, int16_t *filter,
> +   int dstW)
> +{
>  #if ARCH_X86_64
> -int i, j, k, l;
> +int i = 0, j = 0, k = 0;
>  int cpu_flags = av_get_cpu_flags();
> +if (!filter || dstW % 16 != 0) return 0;
>  if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags &
> AV_CPU_FLAG_SLOW_GATHER)) {
> -if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
> -if (dstW % 16 == 0){
> -if (filter != NULL){
> -for (i = 0; i < dstW; i += 8){
> -FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
> -FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
> -}
> -if (filterSize > 4){
> -int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
> -memcpy(tmp2, filter, dstW * filterSize * 2);
> -for (i = 0; i < dstW; i += 16){//pixel
> -for (k = 0; k < filterSize / 4; ++k){//fcoeff
> -for (j = 0; j < 16; ++j){//inner pixel
> -for (l = 0; l < 4; ++l){//coeff
> -int from = i * filterSize + j *
> filterSize + k * 4 + l;
> -int to = (i) * filterSize + j * 4
> + l + k * 64;
> -filter[to] = tmp2[from];
> -}
> -}
> -}
> -}
> -av_free(tmp2);
> -}
> -}
> -}
> +if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
> +   int16_t *filterCopy = NULL;
> +   if (filterSize > 4) {
> +   if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
> +   return AVERROR(ENOMEM);
> +   memcpy(filterCopy, filter, dstW * filterSize *
> sizeof(int16_t));
> +   }
> +   // Do not swap filterPos for pixels which won't be processed by
> +   // the main loop.
> +   for (i = 0; i + 8 <= dstW; i += 8) {
> +   FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
> +   FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
> +   }
> +   if (filterSize > 4) {
> +   // 16 pixels are processed at a time.
> +   for (i = 0; i + 16 <= dstW; i += 16) {
> +   // 4 filter coeffs are processed at a time.
> +   for (k = 0; k + 4 <= filterSize; k += 4) {
> +   for (j = 0; j < 16; ++j) {
> +   int from = (i + j) * filterSize + k;
> +   int to = i * filterSize + j * 4 + k * 16;
> +   memcpy(&filter[to], &filterCopy[from], 4 *
> sizeof(int16_t));
> +   }
> +   }
> +   }
> +   }
> +   if (filterCopy)
&g

[FFmpeg-devel] [PATCH 1/5] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-02-09 Thread Alan Kelly

Make the code more readable and follow the style guide.
---
 libswscale/utils.c | 64 +++---
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index c5ea8853d5..1d919e863a 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,39 +278,49 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
+   int filterSize, int16_t *filter,
+   int dstW)
+{
 #if ARCH_X86_64
-int i, j, k, l;
+int i, j, k;
 int cpu_flags = av_get_cpu_flags();
+// avx2 hscale filter processes 16 pixel blocks.
+if (!filter || dstW % 16 != 0)
+return;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
-if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if (dstW % 16 == 0){
-if (filter != NULL){
-for (i = 0; i < dstW; i += 8){
-FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
-FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
-}
-if (filterSize > 4){
-int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
-memcpy(tmp2, filter, dstW * filterSize * 2);
-for (i = 0; i < dstW; i += 16){//pixel
-for (k = 0; k < filterSize / 4; ++k){//fcoeff
-for (j = 0; j < 16; ++j){//inner pixel
-for (l = 0; l < 4; ++l){//coeff
-int from = i * filterSize + j * 
filterSize + k * 4 + l;
-int to = (i) * filterSize + j * 4 + l 
+ k * 64;
-filter[to] = tmp2[from];
-}
-}
-}
-}
-av_free(tmp2);
-}
-}
-}
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+   int16_t *filterCopy = NULL;
+   if (filterSize > 4) {
+   if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
+   return;
+   memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t));
+   }
+   // Do not swap filterPos for pixels which won't be processed by
+   // the main loop.
+   for (i = 0; i + 8 <= dstW; i += 8) {
+   FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
+   FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   }
+   if (filterSize > 4) {
+   // 16 pixels are processed at a time.
+   for (i = 0; i + 16 <= dstW; i += 16) {
+   // 4 filter coeffs are processed at a time.
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < 16; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 16;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
+   }
+   if (filterCopy)
+   av_free(filterCopy);
 }
 }
 #endif
+return;
 }
 
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
-- 
2.35.0.263.gb82422642f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/5] libswscale: Avx2 hscale can process inputs of any size.

2022-02-09 Thread Alan Kelly

The main loop processes blocks of 16 pixels. The tail processes blocks
of size 4.
---
 libswscale/x86/scale_avx2.asm | 48 +--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..dc42abb100 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 mova m14, [four]
 shr fltsized, 2
 %endif
+cmp wq, 16
+jl .tail_loop
+mov countq, 0x10
 .loop:
 movu m1, [fltposq]
 movu m2, [fltposq+32]
@@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 vpsrad  m6, 7
 vpackssdw m5, m5, m6
 vpermd m5, m15, m5
-vmovdqu [dstq + countq * 2], m5
+vmovdqu [dstq], m5
+add dstq, 0x20
 add fltposq, 0x40
 add countq, 0x10
 cmp countq, wq
-jl .loop
+jle .loop
+
+sub countq, 0x10
+cmp countq, wq
+jge .end
+
+.tail_loop:
+movu xm1, [fltposq]
+%ifidn %1, X4
+pxor xm9, xm9
+pxor xm10, xm10
+xor innerq, innerq
+.tail_innerloop:
+%endif
+vpcmpeqd  xm13, xm13
+vpgatherdd xm3,[srcmemq + xm1], xm13
+vpunpcklbw xm5, xm3, xm0
+vpunpckhbw xm6, xm3, xm0
+vpmaddwd xm5, xm5, [filterq]
+vpmaddwd xm6, xm6, [filterq + 16]
+add filterq, 0x20
+%ifidn %1, X4
+paddd xm9, xm5
+paddd xm10, xm6
+paddd xm1, xm14
+add innerq, 1
+cmp innerq, fltsizeq
+jl .tail_innerloop
+vphaddd xm5, xm9, xm10
+%else
+vphaddd xm5, xm5, xm6
+%endif
+vpsrad  xm5, 7
+vpackssdw xm5, xm5, xm5
+vmovq [dstq], xm5
+add dstq, 0x8
+add fltposq, 0x10
+add countq, 0x4
+cmp countq, wq
+jl .tail_loop
+.end:
 REP_RET
 %endmacro
 
-- 
2.35.0.263.gb82422642f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-02-09 Thread Alan Kelly

ff_shuffle_filter_coefficients shuffles the tail as required.
---
 libswscale/utils.c   | 19 ---
 libswscale/x86/swscale.c |  6 ++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 1d919e863a..31c365fcee 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -285,8 +285,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 #if ARCH_X86_64
 int i, j, k;
 int cpu_flags = av_get_cpu_flags();
-// avx2 hscale filter processes 16 pixel blocks.
-if (!filter || dstW % 16 != 0)
+if (!filter)
 return;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
@@ -298,9 +297,11 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
// Do not swap filterPos for pixels which won't be processed by
// the main loop.
-   for (i = 0; i + 8 <= dstW; i += 8) {
+   for (i = 0; i + 16 <= dstW; i += 16) {
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
+   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
}
if (filterSize > 4) {
// 16 pixels are processed at a time.
@@ -314,6 +315,18 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
}
}
+   // 4 pixels are processed at a time in the tail.
+   for (; i < dstW; i += 4) {
+   // 4 filter coeffs are processed at a time.
+   int rem = dstW - i >= 4 ? 4 : dstW - i;
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < rem; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 4;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
}
if (filterCopy)
av_free(filterCopy);
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 73869355b8..76f5a70fc5 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -691,10 +691,8 @@ switch(c->dstBpc){ \
 
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
-if (c->chrDstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if (c->dstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
 }
 }
 
-- 
2.35.0.263.gb82422642f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 4/5] libswscale: Propagate error codes from ff_shuffle_filter_coefficients

2022-02-09 Thread Alan Kelly

---
 libswscale/swscale_internal.h |  2 +-
 libswscale/utils.c| 14 --
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 3a78d95ba6..26d28d42e6 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 #define MAX_LINES_AHEAD 4
 
 //shuffle filter and filterPos for hyScale and hcScale filters in avx2
-void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
+int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index 31c365fcee..1f8705a417 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
int filterSize, int16_t *filter,
int dstW)
 {
@@ -286,13 +286,13 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 int i, j, k;
 int cpu_flags = av_get_cpu_flags();
 if (!filter)
-return;
+return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
int16_t *filterCopy = NULL;
if (filterSize > 4) {
if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
-   return;
+   return AVERROR(ENOMEM);
memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t));
}
// Do not swap filterPos for pixels which won't be processed by
@@ -333,7 +333,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 }
 }
 #endif
-return;
+return 0;
 }
 
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
@@ -1859,7 +1859,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
+if ((ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW)) < 0)
+goto nomem;
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1869,7 +1870,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
+if ((ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW)) < 0)
+goto nomem;
 }
 } // initialize horizontal stuff
 
-- 
2.35.0.263.gb82422642f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 5/5] checkasm/sw_scale: hscale does not require a cpuflag test.

2022-02-09 Thread Alan Kelly

This is done in ff_shuffle_filter_coefficients.
---
 tests/checkasm/sw_scale.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 3c0a083b42..e7f916d3a8 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -168,8 +168,6 @@ static void check_hscale(void)
   const uint8_t *src, const int16_t *filter,
   const int32_t *filterPos, int filterSize);
 
-int cpu_flags = av_get_cpu_flags();
-
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
@@ -215,10 +213,10 @@ static void check_hscale(void)
 
 filter[SRC_PIXELS * width + i] = rnd();
 }
+
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.35.0.263.gb82422642f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 1/4] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-02-09 Thread Alan Kelly

Hi Michael,

Thanks for your feedback. I have updated the patches and split this patch
into two, one with cosmetic fixes and one propagating the errors. Since
there is now an extra patch in the set and the commit messages have
changed, new threads have been started.

Alan

On Thu, Feb 3, 2022 at 3:11 PM Michael Niedermayer 
wrote:

> On Mon, Jan 10, 2022 at 03:58:33PM +0100, Alan Kelly wrote:
> > Make the code more readable, follow the style guide and propagate memory
> > allocation errors.
>
> Cosmetics and bugfixes should not be in the same patch
>
>
> > ---
> >  libswscale/swscale_internal.h |  2 +-
> >  libswscale/utils.c| 68 ---
> >  2 files changed, 40 insertions(+), 30 deletions(-)
> >
> > diff --git a/libswscale/swscale_internal.h
> b/libswscale/swscale_internal.h
> > index 3a78d95ba6..26d28d42e6 100644
> > --- a/libswscale/swscale_internal.h
> > +++ b/libswscale/swscale_internal.h
> > @@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr,
> int threadnr,
> >  #define MAX_LINES_AHEAD 4
> >
> >  //shuffle filter and filterPos for hyScale and hcScale filters in avx2
> > -void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
> > +int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int
> filterSize, int16_t *filter, int dstW);
> >  #endif /* SWSCALE_SWSCALE_INTERNAL_H */
> > diff --git a/libswscale/utils.c b/libswscale/utils.c
> > index c5ea8853d5..52f07e1661 100644
> > --- a/libswscale/utils.c
> > +++ b/libswscale/utils.c
> > @@ -278,39 +278,47 @@ static const FormatEntry format_entries[] = {
> >  [AV_PIX_FMT_P416LE]  = { 1, 1 },
> >  };
> >
> > -void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int
> filterSize, int16_t *filter, int dstW){
> > +int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
> > +   int filterSize, int16_t *filter,
> > +   int dstW)
> > +{
> >  #if ARCH_X86_64
>
> > -int i, j, k, l;
> > +int i = 0, j = 0, k = 0;
>
> why?
> they are set when used, if I am not mistaken
>
>
> >  int cpu_flags = av_get_cpu_flags();
>
> > +if (!filter || dstW % 16 != 0) return 0;
>
> please add \n, and also a comment explaining what the dstW % 16 case does exactly and why
>
>
> [...]
> >  int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
> > @@ -1836,7 +1844,8 @@ av_cold int sws_init_context(SwsContext *c,
> SwsFilter *srcFilter,
> > get_local_pos(c, 0, 0, 0),
> > get_local_pos(c, 0, 0, 0))) < 0)
> >  goto fail;
> > -ff_shuffle_filter_coefficients(c, c->hLumFilterPos,
> c->hLumFilterSize, c->hLumFilter, dstW);
> > +if ((ret = ff_shuffle_filter_coefficients(c,
> c->hLumFilterPos, c->hLumFilterSize, c->hLumFilter, dstW)) != 0)
> > +goto nomem;
>
> This is confusing as ret is never used, also error codes are <0
>
> thx
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Those who are best at talking, realize last or never when they are wrong.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 1/5] libswscale: Check and propagate memory allocation errors from ff_shuffle_filter_coefficients.

2022-02-17 Thread Alan Kelly

---
 libswscale/swscale_internal.h |  2 +-
 libswscale/utils.c| 11 ---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/libswscale/swscale_internal.h b/libswscale/swscale_internal.h
index 3a78d95ba6..26d28d42e6 100644
--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -1144,5 +1144,5 @@ void ff_sws_slice_worker(void *priv, int jobnr, int 
threadnr,
 #define MAX_LINES_AHEAD 4
 
 //shuffle filter and filterPos for hyScale and hcScale filters in avx2
-void ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
+int ff_shuffle_filter_coefficients(SwsContext *c, int* filterPos, int 
filterSize, int16_t *filter, int dstW);
 #endif /* SWSCALE_SWSCALE_INTERNAL_H */
diff --git a/libswscale/utils.c b/libswscale/utils.c
index c5ea8853d5..344c87dfdf 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,7 +278,7 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-void ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
 #if ARCH_X86_64
 int i, j, k, l;
 int cpu_flags = av_get_cpu_flags();
@@ -292,6 +292,8 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 }
 if (filterSize > 4){
 int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
+if (!tmp2)
+return AVERROR(ENOMEM);
 memcpy(tmp2, filter, dstW * filterSize * 2);
 for (i = 0; i < dstW; i += 16){//pixel
 for (k = 0; k < filterSize / 4; ++k){//fcoeff
@@ -310,6 +312,7 @@ void ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos, int filterSiz
 }
 }
 }
+return 0;
 #endif
 }
 
@@ -1836,7 +1839,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW);
+if (ff_shuffle_filter_coefficients(c, c->hLumFilterPos, 
c->hLumFilterSize, c->hLumFilter, dstW) < 0)
+goto nomem;
 if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
@@ -1846,7 +1850,8 @@ av_cold int sws_init_context(SwsContext *c, SwsFilter 
*srcFilter,
get_local_pos(c, c->chrSrcHSubSample, 
c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, 
c->dst_h_chr_pos, 0))) < 0)
 goto fail;
-ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW);
+if (ff_shuffle_filter_coefficients(c, c->hChrFilterPos, 
c->hChrFilterSize, c->hChrFilter, c->chrDstW) < 0)
+goto nomem;
 }
 } // initialize horizontal stuff
 
-- 
2.35.1.265.g69c8d7142f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 2/5] libswscale: Re-factor ff_shuffle_filter_coefficients.

2022-02-17 Thread Alan Kelly

Make the code more readable and follow the style guide.
---
 libswscale/utils.c | 66 +-
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 344c87dfdf..7c8e1bbdde 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -278,42 +278,48 @@ static const FormatEntry format_entries[] = {
 [AV_PIX_FMT_P416LE]  = { 1, 1 },
 };
 
-int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos, int 
filterSize, int16_t *filter, int dstW){
+int ff_shuffle_filter_coefficients(SwsContext *c, int *filterPos,
+   int filterSize, int16_t *filter,
+   int dstW)
+{
 #if ARCH_X86_64
-int i, j, k, l;
+int i, j, k;
 int cpu_flags = av_get_cpu_flags();
+// avx2 hscale filter processes 16 pixel blocks.
+if (!filter || dstW % 16 != 0)
+return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
-if ((c->srcBpc == 8) && (c->dstBpc <= 14)){
-if (dstW % 16 == 0){
-if (filter != NULL){
-for (i = 0; i < dstW; i += 8){
-FFSWAP(int, filterPos[i + 2], filterPos[i+4]);
-FFSWAP(int, filterPos[i + 3], filterPos[i+5]);
-}
-if (filterSize > 4){
-int16_t *tmp2 = av_malloc(dstW * filterSize * 2);
-if (!tmp2)
-return AVERROR(ENOMEM);
-memcpy(tmp2, filter, dstW * filterSize * 2);
-for (i = 0; i < dstW; i += 16){//pixel
-for (k = 0; k < filterSize / 4; ++k){//fcoeff
-for (j = 0; j < 16; ++j){//inner pixel
-for (l = 0; l < 4; ++l){//coeff
-int from = i * filterSize + j * 
filterSize + k * 4 + l;
-int to = (i) * filterSize + j * 4 + l 
+ k * 64;
-filter[to] = tmp2[from];
-}
-}
-}
-}
-av_free(tmp2);
-}
-}
-}
+if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
+   int16_t *filterCopy = NULL;
+   if (filterSize > 4) {
+   if (!FF_ALLOC_TYPED_ARRAY(filterCopy, dstW * filterSize))
+   return AVERROR(ENOMEM);
+   memcpy(filterCopy, filter, dstW * filterSize * sizeof(int16_t));
+   }
+   // Do not swap filterPos for pixels which won't be processed by
+   // the main loop.
+   for (i = 0; i + 8 <= dstW; i += 8) {
+   FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
+   FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   }
+   if (filterSize > 4) {
+   // 16 pixels are processed at a time.
+   for (i = 0; i + 16 <= dstW; i += 16) {
+   // 4 filter coeffs are processed at a time.
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < 16; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 16;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
+   }
+   av_free(filterCopy);
 }
 }
-return 0;
 #endif
+return 0;
 }
 
 int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
-- 
2.35.1.265.g69c8d7142f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.

2022-02-17 Thread Alan Kelly

The main loop processes blocks of 16 pixels. The tail processes blocks
of size 4.
---
 libswscale/x86/scale_avx2.asm | 48 +--
 1 file changed, 46 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
index 20acdbd633..dc42abb100 100644
--- a/libswscale/x86/scale_avx2.asm
+++ b/libswscale/x86/scale_avx2.asm
@@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 mova m14, [four]
 shr fltsized, 2
 %endif
+cmp wq, 16
+jl .tail_loop
+mov countq, 0x10
 .loop:
 movu m1, [fltposq]
 movu m2, [fltposq+32]
@@ -97,11 +100,52 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem, 
filter, fltpos, fltsize,
 vpsrad  m6, 7
 vpackssdw m5, m5, m6
 vpermd m5, m15, m5
-vmovdqu [dstq + countq * 2], m5
+vmovdqu [dstq], m5
+add dstq, 0x20
 add fltposq, 0x40
 add countq, 0x10
 cmp countq, wq
-jl .loop
+jle .loop
+
+sub countq, 0x10
+cmp countq, wq
+jge .end
+
+.tail_loop:
+movu xm1, [fltposq]
+%ifidn %1, X4
+pxor xm9, xm9
+pxor xm10, xm10
+xor innerq, innerq
+.tail_innerloop:
+%endif
+vpcmpeqd  xm13, xm13
+vpgatherdd xm3,[srcmemq + xm1], xm13
+vpunpcklbw xm5, xm3, xm0
+vpunpckhbw xm6, xm3, xm0
+vpmaddwd xm5, xm5, [filterq]
+vpmaddwd xm6, xm6, [filterq + 16]
+add filterq, 0x20
+%ifidn %1, X4
+paddd xm9, xm5
+paddd xm10, xm6
+paddd xm1, xm14
+add innerq, 1
+cmp innerq, fltsizeq
+jl .tail_innerloop
+vphaddd xm5, xm9, xm10
+%else
+vphaddd xm5, xm5, xm6
+%endif
+vpsrad  xm5, 7
+vpackssdw xm5, xm5, xm5
+vmovq [dstq], xm5
+add dstq, 0x8
+add fltposq, 0x10
+add countq, 0x4
+cmp countq, wq
+jl .tail_loop
+.end:
 REP_RET
 %endmacro
 
-- 
2.35.1.265.g69c8d7142f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-02-17 Thread Alan Kelly

ff_shuffle_filter_coefficients shuffles the tail as required.
---
 libswscale/utils.c   | 19 ---
 libswscale/x86/swscale.c |  6 ++
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index 7c8e1bbdde..d818c9ce55 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 #if ARCH_X86_64
 int i, j, k;
 int cpu_flags = av_get_cpu_flags();
-// avx2 hscale filter processes 16 pixel blocks.
-if (!filter || dstW % 16 != 0)
+if (!filter)
 return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
@@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
// Do not swap filterPos for pixels which won't be processed by
// the main loop.
-   for (i = 0; i + 8 <= dstW; i += 8) {
+   for (i = 0; i + 16 <= dstW; i += 16) {
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
+   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
}
if (filterSize > 4) {
// 16 pixels are processed at a time.
@@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
}
}
+   // 4 pixels are processed at a time in the tail.
+   for (; i < dstW; i += 4) {
+   // 4 filter coeffs are processed at a time.
+   int rem = dstW - i >= 4 ? 4 : dstW - i;
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < rem; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 4;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
}
av_free(filterCopy);
 }
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 73869355b8..76f5a70fc5 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -691,10 +691,8 @@ switch(c->dstBpc){ \
 
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
-if (c->chrDstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if (c->dstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
 }
 }
 
-- 
2.35.1.265.g69c8d7142f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 5/5] checkasm/sw_scale: hscale does not require a cpuflag test.

2022-02-17 Thread Alan Kelly

This is done in ff_shuffle_filter_coefficients.
---
 tests/checkasm/sw_scale.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 3c0a083b42..4c57b6a372 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -168,8 +168,6 @@ static void check_hscale(void)
   const uint8_t *src, const int16_t *filter,
   const int32_t *filterPos, int filterSize);
 
-int cpu_flags = av_get_cpu_flags();
-
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
@@ -217,8 +215,7 @@ static void check_hscale(void)
 }
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d_width%d", 
ctx->srcBpc, ctx->dstBpc + 1, width)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.35.1.265.g69c8d7142f-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.

2022-03-07 Thread Alan Kelly

Hi Michael,

Thanks for reviewing the first two parts of this patchset.

Is there anybody interested in reviewing this part?

Thanks,

Alan

On Thu, Feb 17, 2022 at 5:21 PM Michael Niedermayer 
wrote:

> On Thu, Feb 17, 2022 at 11:04:04AM +0100, Alan Kelly wrote:
> > The main loop processes blocks of 16 pixels. The tail processes blocks
> > of size 4.
> > ---
> >  libswscale/x86/scale_avx2.asm | 48 +--
> >  1 file changed, 46 insertions(+), 2 deletions(-)
>
> I'll wait a few days on this; there are people here who know AVX2 better
> than I do.
> It's been a while since I wrote x86 SIMD,
> but if no one else reviews this then I'll do it.
>
> thx
>
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> "You are 36 times more likely to die in a bathtub than at the hands of a
> terrorist. Also, you are 2.5 times more likely to become a president and
> 2 times more likely to become an astronaut, than to die in a terrorist
> attack." -- Thoughty2
>
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 1/3] swscale/x86/swscale: Process yuv2yuvX tails using next largest register size

2023-07-14 Thread Alan Kelly

---
 libswscale/x86/swscale.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index ff16398988..8c67bf4fab 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -194,7 +194,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 return; \
 }
 
-#define YUV2YUVX_FUNC(opt, step)  \
+#define YUV2YUVX_FUNC(opt, step, tail)  \
 void ff_yuv2yuvX_ ##opt(const int16_t *filter, int filterSize, int srcOffset, \
uint8_t *dest, int dstW,  \
const uint8_t *dither, int offset); \
@@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 if(pixelsProcessed > 0) \
 ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
-  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
+  yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
 } \
 return; \
 }
@@ -220,10 +220,10 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 YUV2YUVX_FUNC_MMX(mmxext, 16)
 #endif
 #if HAVE_SSE3_EXTERNAL
-YUV2YUVX_FUNC(sse3, 32)
+YUV2YUVX_FUNC(sse3, 32, mmxext)
 #endif
 #if HAVE_AVX2_EXTERNAL
-YUV2YUVX_FUNC(avx2, 64)
+YUV2YUVX_FUNC(avx2, 64, sse3)
 #endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
-- 
2.41.0.255.g8b1d071c50-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512

2023-07-14 Thread Alan Kelly

---
 libswscale/x86/swscale.c|  7 +++
 libswscale/x86/yuv2yuvX.asm | 19 ++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 8c67bf4fab..52423a1199 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32, mmxext)
 #if HAVE_AVX2_EXTERNAL
 YUV2YUVX_FUNC(avx2, 64, sse3)
 #endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+YUV2YUVX_FUNC(avx512, 128, avx2)
+#endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
@@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 #if HAVE_AVX2_EXTERNAL
 if (EXTERNAL_AVX2_FAST(cpu_flags))
 c->yuv2planeX = yuv2yuvX_avx2;
+#endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+if (EXTERNAL_AVX512(cpu_flags))
+c->yuv2planeX = yuv2yuvX_avx512;
 #endif
 }
 #if ARCH_X86_32 && !HAVE_ALIGNED_STACK
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 369c850674..57bfa09d66 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -22,6 +22,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA 64
+
+permutation: dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 
 ;-
@@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+
+%if cpuflag(avx512)
+mova m15, [permutation]
+%endif
 cmp  offsetd, 0
 jz   .offset
 
@@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 packuswb m6, m6, m1
 %endif
 mov  srcq, [filterq]
-%if cpuflag(avx2)
+%if cpuflag(avx512)
+vpermt2q m3, m15, m3
+vpermt2q m6, m15, m6
+%elif cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
@@ -131,4 +142,10 @@ YUV2YUVX_FUNC
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 YUV2YUVX_FUNC
+%if HAVE_AVX512_EXTERNAL
+%if ARCH_X86_64
+INIT_ZMM avx512
+YUV2YUVX_FUNC
+%endif
+%endif
 %endif
-- 
2.41.0.255.g8b1d071c50-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.

2023-07-14 Thread Alan Kelly

---
 libswscale/x86/swscale.c| 11 ---
 libswscale/x86/yuv2yuvX.asm | 12 ++--
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 52423a1199..71434f58d3 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-int remainder = (dstW % step); \
-int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-if(pixelsProcessed > 0) \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
-if(remainder > 0){ \
-  yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
-} \
+if (dstW >= step) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+else \
+yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); 
\
 return; \
 }
 
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 57bfa09d66..ad0e8bd448 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+mov  ditherq, dstWq
+sub  dstWq, mmsize * unroll
 
 %if cpuflag(avx512)
 mova m15, [permutation]
@@ -131,8 +133,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
-jb  .outerloop
-RET
+jb   .outerloop
+
+mov  dstWq, offsetq
+mov  offsetq, ditherq
+sub  offsetq, mmsize * unroll
+cmp  dstWq, ditherq
+jb   .outerloop
+REP_RET
 %endmacro
 
 INIT_MMX mmxext
-- 
2.41.0.255.g8b1d071c50-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512

2023-07-17 Thread Alan Kelly

Happy to add the check.

Thanks,
Alan

On Fri, Jul 14, 2023 at 4:59 PM James Almer  wrote:

> On 7/14/2023 11:57 AM, Kieran Kunhya wrote:
> > On Fri, 14 Jul 2023 at 14:03, James Almer  wrote:
> >
> >> On 7/14/2023 9:59 AM, Kieran Kunhya wrote:
> >>>> +#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
> >>>> +if (EXTERNAL_AVX512(cpu_flags))
> >>>> +c->yuv2planeX = yuv2yuvX_avx512;
> >>>>  #endif
> >>>>
> >>>
> >>>You want EXTERNAL_AVX512ICL here.
> >>
> >> vpermt2q with zmm registers is avx512f and not any of the extensions, so
> >> that check is fine.
> >>
> >
> > We still support Skylake and we don't want downclocking on that platform.
> > At least that was my understanding of the intention of AVX512 vs
> AVX512ICL.
> > It appears I'm the only one following this convention though.
>
> Ah, no opinion in that regard. I was following the use of the checks in
> the strict technical sense of instruction availability.
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH 2/3] swscale/x86/yuv2yuvX: Add yuv2yuvX avx512

2023-07-17 Thread Alan Kelly

---
 Checks for EXTERNAL_AVX512ICL to prevent downclocking on Skylake
 libswscale/x86/swscale.c|  7 +++
 libswscale/x86/yuv2yuvX.asm | 19 ++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 8c67bf4fab..600c7d6c91 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -225,6 +225,9 @@ YUV2YUVX_FUNC(sse3, 32, mmxext)
 #if HAVE_AVX2_EXTERNAL
 YUV2YUVX_FUNC(avx2, 64, sse3)
 #endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+YUV2YUVX_FUNC(avx512, 128, avx2)
+#endif
 
 #define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
 void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
@@ -467,6 +470,10 @@ av_cold void ff_sws_init_swscale_x86(SwsContext *c)
 #if HAVE_AVX2_EXTERNAL
 if (EXTERNAL_AVX2_FAST(cpu_flags))
 c->yuv2planeX = yuv2yuvX_avx2;
+#endif
+#if ARCH_X86_64 && HAVE_AVX512_EXTERNAL
+if (EXTERNAL_AVX512ICL(cpu_flags))
+c->yuv2planeX = yuv2yuvX_avx512;
 #endif
 }
 #if ARCH_X86_32 && !HAVE_ALIGNED_STACK
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 369c850674..57bfa09d66 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -22,6 +22,10 @@
 
 %include "libavutil/x86/x86util.asm"
 
+SECTION_RODATA 64
+
+permutation: dq 0, 2, 4, 6, 1, 3, 5, 7
+
 SECTION .text
 
 ;-
@@ -50,6 +54,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+
+%if cpuflag(avx512)
+mova m15, [permutation]
+%endif
 cmp  offsetd, 0
 jz   .offset
 
@@ -109,7 +117,10 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 packuswb m6, m6, m1
 %endif
 mov  srcq, [filterq]
-%if cpuflag(avx2)
+%if cpuflag(avx512)
+vpermt2q m3, m15, m3
+vpermt2q m6, m15, m6
+%elif cpuflag(avx2)
 vpermq   m3, m3, 216
 vpermq   m6, m6, 216
 %endif
@@ -131,4 +142,10 @@ YUV2YUVX_FUNC
 %if HAVE_AVX2_EXTERNAL
 INIT_YMM avx2
 YUV2YUVX_FUNC
+%if HAVE_AVX512_EXTERNAL
+%if ARCH_X86_64
+INIT_ZMM avx512
+YUV2YUVX_FUNC
+%endif
+%endif
 %endif
-- 
2.41.0.255.g8b1d071c50-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.

2023-07-17 Thread Alan Kelly

On Sat, Jul 15, 2023 at 10:40 PM Michael Niedermayer 
wrote:

> On Fri, Jul 14, 2023 at 12:08:46PM +0200, Alan Kelly wrote:
> > ---
> >  libswscale/x86/swscale.c| 11 ---
> >  libswscale/x86/yuv2yuvX.asm | 12 ++--
> >  2 files changed, 14 insertions(+), 9 deletions(-)
>
> seems to segfault with
>
> ./ffmpeg_g -i mm-short.mpg -an -vcodec snow -t 0.2 -bitexact -pix_fmt
> yuv410p -s 199x199 -vstrict -2 -y  snow3914-199-410.avi
>
> Thread 79 "ffmpeg_g" received signal SIGSEGV, Segmentation fault.
> [Switching to Thread 0x7fffaffef700 (LWP 23533)]
> 0x5658a0f6 in ff_yuv2yuvX_sse3 ()
> (gdb) bt
> #0  0x5658a0f6 in ff_yuv2yuvX_sse3 ()
> #1  0x56585bc6 in chr_planar_vscale ()
> #2  0x565817d1 in scale_internal ()
> #3  0x565827d9 in ff_sws_slice_worker ()
> #4  0x5662b06e in thread_worker ()
> #5  0x775fc6db in start_thread (arg=0x7fffaffef700) at
> pthread_create.c:463
> #6  0x7fffed12861f in clone () at
> ../sysdeps/unix/sysv/linux/x86_64/clone.S:95
> (gdb) disassemble $rip-32,$rip+32
> Dump of assembler code from 0x5658a0d6 to 0x5658a116:
>0x5658a0d6 :std
>0x5658a0d7 :fldenv 0xf(%rsi)
>0x5658a0da :outsl  %ds:(%rsi),(%dx)
>0x5658a0db :sti
>0x5658a0dc :psraw  $0x4,%xmm7
>0x5658a0e1 :movdqa %xmm7,%xmm4
>0x5658a0e5 :   movdqa %xmm7,%xmm3
>0x5658a0e9 :   movdqa %xmm7,%xmm6
>0x5658a0ed :   movdqa %xmm7,%xmm1
>0x5658a0f1 :   movddup 0x8(%rsi),%xmm0
> => 0x5658a0f6 :   movdqa (%rdx,%rax,2),%xmm2
>0x5658a0fb :   pmulhw %xmm0,%xmm2
>0x5658a0ff :   movdqa
> 0x10(%rdx,%rax,2),%xmm5
>0x5658a105 :   pmulhw %xmm0,%xmm5
>0x5658a109 :   paddw  %xmm2,%xmm3
>0x5658a10d :   paddw  %xmm5,%xmm4
>0x5658a111 :   movdqa
> 0x20(%rdx,%rax,2),%xmm2
> End of assembler dump.
> (gdb) info all-registers
> rax0x12 18
> rbx0x32 50
> rcx0x57915480   93825029723264
> rdx0x57687680   93825027044992
> rsi0x5758   93825026909784
> rdi0x5758   93825026909784
> rbp0x5765b880   0x5765b880
> rsp0x7fffaffee7a8   0x7fffaffee7a8
> r8 0x20 32
> r9 0x32 50
> r100x56589860   93825009227872
> r110x576f9dc0   93825027513792
> r120x5763b280   93825026732672
> r130x5758   93825026909784
> r140x577b5800   93825028282368
> r150x57622640   93825026631232
> rip0x5658a0f6   0x5658a0f6 
> eflags 0x10297  [ CF PF AF SF IF RF ]
> cs 0x33 51
> ss 0x2b 43
> ds 0x0  0
> es 0x0  0
> fs 0x0  0
> gs 0x0  0
> st00(raw 0x)
> st10(raw 0x)
> st20(raw 0x)
> st30(raw 0x)
> st40(raw 0x)
> st50(raw 0x)
> st60(raw 0x)
> st70(raw 0x)
> fctrl  0x   65535
> fstat  0x   65535
> ftag   0x   43690
> fiseg  0x1  1
> fioff  0x0  0
> foseg  0x5646   22086
> fooff  0xa  10
> fop0x7ff2047
> mxcsr  0x1fa8   [ OE PE IM DM ZM OM UM PM ]
>
>
> >
> > diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> > index 52423a1199..71434f58d3 100644
> > --- a/libswscale/x86/swscale.c
> > +++ b/libswscale/x86/swscale.c
> > @@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter,
> int filterSize, \
> > const int16_t **src, uint8_t *dest, int
> dstW, \
> > const uint8_t *dither, int offset) \
> >  { \
> > -int remainder = (dstW % step); \
> > -int pixelsProcessed = dstW - remainder; \
> >  if(((uintptr_t)dest) & 15){ \
> >  yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither,
> offset); \
> >  return; \
> >  } \
> > -if(pixelsProcessed > 0) \
> > -ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset,
> pixelsProcessed + offset, dither, offset); \
> > [remainder of quoted patch truncated in the archive]

[FFmpeg-devel] [PATCH 3/3] swscale/x86/yuv2yuvX: Process tails by jumping back into the main loop.

2023-07-17 Thread Alan Kelly

---
 libswscale/x86/swscale.c| 11 ---
 libswscale/x86/yuv2yuvX.asm | 24 ++--
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 600c7d6c91..6980002e9e 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -202,17 +202,14 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset) \
 { \
-int remainder = (dstW % step); \
-int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
 yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
-if(pixelsProcessed > 0) \
-ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
-if(remainder > 0){ \
-  yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); \
-} \
+if (dstW >= step) \
+ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, dstW + 
offset, dither, offset); \
+else \
+yuv2yuvX_ ##tail(filter, filterSize, src, dest, dstW, dither, offset); 
\
 return; \
 }
 
diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index 57bfa09d66..03bfd6ad1d 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -54,6 +54,8 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 movq xm3, [ditherq]
 %endif ; avx2
+mov  ditherq, dstWq
+sub  dstWq, mmsize * unroll
 
 %if cpuflag(avx512)
 mova m15, [permutation]
@@ -92,13 +94,17 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 %else
 mova m0, [filterSizeq + 8]
 %endif
-pmulhw   m2, m0, [srcq + offsetq * 2]
-pmulhw   m5, m0, [srcq + offsetq * 2 + mmsize]
+movu m2, [srcq + offsetq * 2]
+movu m5, [srcq + offsetq * 2 + mmsize]
+pmulhw   m2, m0, m2
+pmulhw   m5, m0, m5
 paddwm3, m3, m2
 paddwm4, m4, m5
 %if cpuflag(sse3)
-pmulhw   m2, m0, [srcq + offsetq * 2 + 2 * mmsize]
-pmulhw   m5, m0, [srcq + offsetq * 2 + 3 * mmsize]
+movu m2, [srcq + offsetq * 2 + 2 * mmsize]
+movu m5, [srcq + offsetq * 2 + 3 * mmsize]
+pmulhw   m2, m0, m2
+pmulhw   m5, m0, m5
 paddwm6, m6, m2
 paddwm1, m1, m5
 %endif
@@ -131,8 +137,14 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 add  offsetq, mmsize * unroll
 mov  filterSizeq, filterq
 cmp  offsetq, dstWq
-jb  .outerloop
-RET
+jb   .outerloop
+
+mov  dstWq, offsetq
+mov  offsetq, ditherq
+sub  offsetq, mmsize * unroll
+cmp  dstWq, ditherq
+jb   .outerloop
+REP_RET
 %endmacro
 
 INIT_MMX mmxext
-- 
2.41.0.255.g8b1d071c50-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 3/5] libswscale: Avx2 hscale can process inputs of any size.

2022-07-13 Thread Alan Kelly

Hi,

Are there any further comments on this patch or can it be committed?

Thanks,

Alan

On Tue, Apr 26, 2022 at 10:00 AM Alan Kelly  wrote:

> The main loop processes blocks of 16 pixels. The tail processes blocks
> of size 4.
> ---
>  libswscale/x86/scale_avx2.asm | 44 ++-
>  1 file changed, 43 insertions(+), 1 deletion(-)
>
> diff --git a/libswscale/x86/scale_avx2.asm b/libswscale/x86/scale_avx2.asm
> index 20acdbd633..7657b2825f 100644
> --- a/libswscale/x86/scale_avx2.asm
> +++ b/libswscale/x86/scale_avx2.asm
> @@ -53,6 +53,9 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w, srcmem,
> filter, fltpos, fltsize,
>  mova m14, [four]
>  shr fltsized, 2
>  %endif
> +cmp wq, 16
> +jl .tail_loop
> +sub wq, 0x10
>  .loop:
>  movu m1, [fltposq]
>  movu m2, [fltposq+32]
> @@ -101,7 +104,46 @@ cglobal hscale8to15_%1, 7, 9, 16, pos0, dst, w,
> srcmem, filter, fltpos, fltsize,
>  add fltposq, 0x40
>  add countq, 0x10
>  cmp countq, wq
> -jl .loop
> +jle .loop
> +
> +add wq, 0x10
> +cmp countq, wq
> +jge .end
> +
> +.tail_loop:
> +movu xm1, [fltposq]
> +%ifidn %1, X4
> +pxor xm9, xm9
> +pxor xm10, xm10
> +xor innerq, innerq
> +.tail_innerloop:
> +%endif
> +vpcmpeqd  xm13, xm13
> +vpgatherdd xm3,[srcmemq + xm1], xm13
> +vpunpcklbw xm5, xm3, xm0
> +vpunpckhbw xm6, xm3, xm0
> +vpmaddwd xm5, xm5, [filterq]
> +vpmaddwd xm6, xm6, [filterq + 16]
> +add filterq, 0x20
> +%ifidn %1, X4
> +paddd xm9, xm5
> +paddd xm10, xm6
> +paddd xm1, xm14
> +add innerq, 1
> +cmp innerq, fltsizeq
> +jl .tail_innerloop
> +vphaddd xm5, xm9, xm10
> +%else
> +vphaddd xm5, xm5, xm6
> +%endif
> +vpsrad  xm5, 7
> +vpackssdw xm5, xm5, xm5
> +vmovq [dstq + countq * 2], xm5
> +add fltposq, 0x10
> +add countq, 0x4
> +cmp countq, wq
> +jl .tail_loop
> +.end:
>  REP_RET
>  %endmacro
>
> --
> 2.36.0.rc2.479.g8af0fa9b8e-goog
>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-07-13 Thread Alan Kelly

Pushing this back up to the top. This is required to enable the previous
patch in this chain. Thanks

On Fri, Apr 22, 2022 at 10:04 AM Alan Kelly  wrote:

> Ping!
>
> On Thu, Feb 17, 2022 at 11:04 AM Alan Kelly  wrote:
>
>> ff_shuffle_filter_coefficients shuffles the tail as required.
>> ---
>>  libswscale/utils.c   | 19 ---
>>  libswscale/x86/swscale.c |  6 ++
>>  2 files changed, 18 insertions(+), 7 deletions(-)
>>
>> diff --git a/libswscale/utils.c b/libswscale/utils.c
>> index 7c8e1bbdde..d818c9ce55 100644
>> --- a/libswscale/utils.c
>> +++ b/libswscale/utils.c
>> @@ -285,8 +285,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int
>> *filterPos,
>>  #if ARCH_X86_64
>>  int i, j, k;
>>  int cpu_flags = av_get_cpu_flags();
>> -// avx2 hscale filter processes 16 pixel blocks.
>> -if (!filter || dstW % 16 != 0)
>> +if (!filter)
>>  return 0;
>>  if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags &
>> AV_CPU_FLAG_SLOW_GATHER)) {
>>  if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
>> @@ -298,9 +297,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c,
>> int *filterPos,
>> }
>> // Do not swap filterPos for pixels which won't be processed
>> by
>> // the main loop.
>> -   for (i = 0; i + 8 <= dstW; i += 8) {
>> +   for (i = 0; i + 16 <= dstW; i += 16) {
>> FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
>> FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
>> +   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
>> +   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
>> }
>> if (filterSize > 4) {
>> // 16 pixels are processed at a time.
>> @@ -314,6 +315,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c,
>> int *filterPos,
>> }
>> }
>> }
>> +   // 4 pixels are processed at a time in the tail.
>> +   for (; i < dstW; i += 4) {
>> +   // 4 filter coeffs are processed at a time.
>> +   int rem = dstW - i >= 4 ? 4 : dstW - i;
>> +   for (k = 0; k + 4 <= filterSize; k += 4) {
>> +   for (j = 0; j < rem; ++j) {
>> +   int from = (i + j) * filterSize + k;
>> +   int to = i * filterSize + j * 4 + k * 4;
>> +   memcpy(&filter[to], &filterCopy[from], 4 *
>> sizeof(int16_t));
>> +   }
>> +   }
>> +   }
>> }
>> av_free(filterCopy);
>>  }
>> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
>> index 73869355b8..76f5a70fc5 100644
>> --- a/libswscale/x86/swscale.c
>> +++ b/libswscale/x86/swscale.c
>> @@ -691,10 +691,8 @@ switch(c->dstBpc){ \
>>
>>  if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags &
>> AV_CPU_FLAG_SLOW_GATHER)) {
>>  if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
>> -if (c->chrDstW % 16 == 0)
>> -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
>> -if (c->dstW % 16 == 0)
>> -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
>> +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
>> +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
>>  }
>>  }
>>
>> --
>> 2.35.1.265.g69c8d7142f-goog
>>
>>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-07-15 Thread Alan Kelly

ff_shuffle_filter_coefficients shuffles the tail as required.
---
 libswscale/utils.c| 19 ---
 libswscale/x86/swscale.c  |  6 ++
 tests/checkasm/sw_scale.c |  2 +-
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/libswscale/utils.c b/libswscale/utils.c
index cb4f5b521c..544b7fee96 100644
--- a/libswscale/utils.c
+++ b/libswscale/utils.c
@@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
 #if ARCH_X86_64
 int i, j, k;
 int cpu_flags = av_get_cpu_flags();
-// avx2 hscale filter processes 16 pixel blocks.
-if (!filter || dstW % 16 != 0)
+if (!filter)
 return 0;
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
@@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
// Do not swap filterPos for pixels which won't be processed by
// the main loop.
-   for (i = 0; i + 8 <= dstW; i += 8) {
+   for (i = 0; i + 16 <= dstW; i += 16) {
FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
+   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
+   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
}
if (filterSize > 4) {
// 16 pixels are processed at a time.
@@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int 
*filterPos,
}
}
}
+   // 4 pixels are processed at a time in the tail.
+   for (; i < dstW; i += 4) {
+   // 4 filter coeffs are processed at a time.
+   int rem = dstW - i >= 4 ? 4 : dstW - i;
+   for (k = 0; k + 4 <= filterSize; k += 4) {
+   for (j = 0; j < rem; ++j) {
+   int from = (i + j) * filterSize + k;
+   int to = i * filterSize + j * 4 + k * 4;
+   memcpy(&filter[to], &filterCopy[from], 4 * 
sizeof(int16_t));
+   }
+   }
+   }
}
av_free(filterCopy);
 }
diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 628f12137c..f628c71bd4 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -626,10 +626,8 @@ switch(c->dstBpc){ \
 
 if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER)) {
 if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
-if (c->chrDstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
-if (c->dstW % 16 == 0)
-ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
+ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
 }
 }
 
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index b643a47c30..798990a6cf 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -223,7 +223,7 @@ static void check_hscale(void)
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
 if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, SRC_PIXELS);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", 
ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.37.0.170.g444d1eabd0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH v2 5/5] checkasm/sw_scale: hscale does not require cpuflag test.

2022-07-15 Thread Alan Kelly

This is done in ff_shuffle_filter_coefficients.
---
 tests/checkasm/sw_scale.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index 798990a6cf..7be107bef1 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -172,8 +172,6 @@ static void check_hscale(void)
   const uint8_t *src, const int16_t *filter,
   const int32_t *filterPos, int filterSize);
 
-int cpu_flags = av_get_cpu_flags();
-
 ctx = sws_alloc_context();
 if (sws_init_context(ctx, NULL, NULL) < 0)
 fail();
@@ -222,8 +220,7 @@ static void check_hscale(void)
 ctx->dstW = ctx->chrDstW = input_sizes[dstWi];
 ff_sws_init_scale(ctx);
 memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS * 
MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
-if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags & 
AV_CPU_FLAG_SLOW_GATHER))
-ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
+ff_shuffle_filter_coefficients(ctx, filterPosAvx, width, 
filterAvx2, ctx->dstW);
 
 if (check_func(ctx->hcScale, "hscale_%d_to_%d__fs_%d_dstW_%d", 
ctx->srcBpc, ctx->dstBpc + 1, width, ctx->dstW)) {
 memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
-- 
2.37.0.170.g444d1eabd0-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-07-15 Thread Alan Kelly

Hi Michael,

Thanks for looking at this. I fixed the test issue.

Alan

On Fri, Jul 15, 2022 at 4:59 PM Alan Kelly  wrote:

> ff_shuffle_filter_coefficients shuffles the tail as required.
> ---
>  libswscale/utils.c| 19 ---
>  libswscale/x86/swscale.c  |  6 ++
>  tests/checkasm/sw_scale.c |  2 +-
>  3 files changed, 19 insertions(+), 8 deletions(-)
>
> diff --git a/libswscale/utils.c b/libswscale/utils.c
> index cb4f5b521c..544b7fee96 100644
> --- a/libswscale/utils.c
> +++ b/libswscale/utils.c
> @@ -266,8 +266,7 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int
> *filterPos,
>  #if ARCH_X86_64
>  int i, j, k;
>  int cpu_flags = av_get_cpu_flags();
> -// avx2 hscale filter processes 16 pixel blocks.
> -if (!filter || dstW % 16 != 0)
> +if (!filter)
>  return 0;
>  if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags &
> AV_CPU_FLAG_SLOW_GATHER)) {
>  if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
> @@ -279,9 +278,11 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int
> *filterPos,
> }
> // Do not swap filterPos for pixels which won't be processed by
> // the main loop.
> -   for (i = 0; i + 8 <= dstW; i += 8) {
> +   for (i = 0; i + 16 <= dstW; i += 16) {
> FFSWAP(int, filterPos[i + 2], filterPos[i + 4]);
> FFSWAP(int, filterPos[i + 3], filterPos[i + 5]);
> +   FFSWAP(int, filterPos[i + 10], filterPos[i + 12]);
> +   FFSWAP(int, filterPos[i + 11], filterPos[i + 13]);
> }
> if (filterSize > 4) {
> // 16 pixels are processed at a time.
> @@ -295,6 +296,18 @@ int ff_shuffle_filter_coefficients(SwsContext *c, int
> *filterPos,
> }
> }
> }
> +   // 4 pixels are processed at a time in the tail.
> +   for (; i < dstW; i += 4) {
> +   // 4 filter coeffs are processed at a time.
> +   int rem = dstW - i >= 4 ? 4 : dstW - i;
> +   for (k = 0; k + 4 <= filterSize; k += 4) {
> +   for (j = 0; j < rem; ++j) {
> +   int from = (i + j) * filterSize + k;
> +   int to = i * filterSize + j * 4 + k * 4;
> +   memcpy(&filter[to], &filterCopy[from], 4 *
> sizeof(int16_t));
> +   }
> +   }
> +   }
> }
> av_free(filterCopy);
>  }
> diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
> index 628f12137c..f628c71bd4 100644
> --- a/libswscale/x86/swscale.c
> +++ b/libswscale/x86/swscale.c
> @@ -626,10 +626,8 @@ switch(c->dstBpc){ \
>
>  if (EXTERNAL_AVX2_FAST(cpu_flags) && !(cpu_flags &
> AV_CPU_FLAG_SLOW_GATHER)) {
>  if ((c->srcBpc == 8) && (c->dstBpc <= 14)) {
> -if (c->chrDstW % 16 == 0)
> -ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> -if (c->dstW % 16 == 0)
> -ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
> +ASSIGN_AVX2_SCALE_FUNC(c->hcScale, c->hChrFilterSize);
> +ASSIGN_AVX2_SCALE_FUNC(c->hyScale, c->hLumFilterSize);
>  }
>  }
>
> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
> index b643a47c30..798990a6cf 100644
> --- a/tests/checkasm/sw_scale.c
> +++ b/tests/checkasm/sw_scale.c
> @@ -223,7 +223,7 @@ static void check_hscale(void)
>  ff_sws_init_scale(ctx);
>  memcpy(filterAvx2, filter, sizeof(uint16_t) * (SRC_PIXELS
> * MAX_FILTER_WIDTH + MAX_FILTER_WIDTH));
>  if ((cpu_flags & AV_CPU_FLAG_AVX2) && !(cpu_flags &
> AV_CPU_FLAG_SLOW_GATHER))
> -ff_shuffle_filter_coefficients(ctx, filterPosAvx,
> width, filterAvx2, SRC_PIXELS);
> +ff_shuffle_filter_coefficients(ctx, filterPosAvx,
> width, filterAvx2, ctx->dstW);
>
>  if (check_func(ctx->hcScale,
> "hscale_%d_to_%d__fs_%d_dstW_%d", ctx->srcBpc, ctx->dstBpc + 1, width,
> ctx->dstW)) {
>  memset(dst0, 0, SRC_PIXELS * sizeof(dst0[0]));
> --
> 2.37.0.170.g444d1eabd0-goog
>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-07-18 Thread Alan Kelly

Hi Michael,

I have tried to recreate this locally in a clean client applying the
patches as sent in the email thread. I have tried gcc and mingw and this
passes for me. Are you sure you applied both patches 3 & 4? If only patch 4
is applied, then I get the error you have.

Thanks,

Alan



On Sat, Jul 16, 2022 at 1:14 PM Michael Niedermayer 
wrote:

> On Fri, Jul 15, 2022 at 05:03:56PM +0200, Alan Kelly wrote:
> > Hi Michael,
> >
> > Thanks for looking at this. I fixed the test issue.
>
> seems to be still failing here:
> make distclean ; ./configure && make -j32  tests/checkasm/checkasm &&
> tests/checkasm/checkasm --test=sw_scale
> checkasm: using random seed 1328711543
> MMXEXT:
>  - sw_scale.yuv2yuvX [OK]
> SSE2:
>  - sw_scale.hscale   [OK]
> SSE3:
>  - sw_scale.yuv2yuvX [OK]
> SSSE3:
>  - sw_scale.hscale   [OK]
> SSE4.1:
>  - sw_scale.hscale   [OK]
> AVX2:
>hscale_8_to_15__fs_4_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_4_dstW_24_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_8_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_8_dstW_24_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_12_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_12_dstW_24_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_16_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_16_dstW_24_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_32_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_32_dstW_24_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_40_dstW_8_avx2 (sw_scale.c:235)
>hscale_8_to_15__fs_40_dstW_24_avx2 (sw_scale.c:235)
>  - sw_scale.hscale   [FAILED]
>  - sw_scale.yuv2yuvX [OK]
> checkasm: 12 of 504 tests have failed
>
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> It is a danger to trust the dream we wish for rather than
> the science we have, -- Dr. Kenneth Brown
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 4/5] libswscale: Enable hscale_avx2 for all input sizes.

2022-08-15 Thread Alan Kelly

Hi Michael,

Is there anything blocking this change being applied? Is there anything I
can do to help?

Thanks,

Alan

On Mon, Jul 18, 2022 at 6:49 PM Michael Niedermayer 
wrote:

> On Mon, Jul 18, 2022 at 09:54:39AM +0200, Alan Kelly wrote:
> > Hi Michael,
> >
> > I have tried to recreate this locally in a clean client applying the
> > patches as sent in the email thread. I have tried gcc and mingw and this
> > passes for me. Are you sure you applied both patches 3 & 4? If only
> patch 4
> > is applied, then I get the error you have.
>
> ive retested, and i cannot reproduce, i think i had #4 & #5 not #3 and #4
> applied
>
> thx
>
> [...]
> --
> Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Those who are too smart to engage in politics are punished by being
> governed by those who are dumber. -- Plato
> ___
> ffmpeg-devel mailing list
> ffmpeg-devel@ffmpeg.org
> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
> To unsubscribe, visit link above, or email
> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext

2022-08-17 Thread Alan Kelly

---
 libswscale/x86/swscale.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 32d441245d..881a4b7798 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -211,7 +211,7 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 if(pixelsProcessed > 0) \
 ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
-  ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
+  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
 } \
 return; \
 }
-- 
2.37.1.595.g718a3a8f04-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext

2022-08-17 Thread Alan Kelly

---
 Call yuv2yuvX_mmxext on line 208 also.
 libswscale/x86/swscale.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 32d441245d..e0f90d5c58 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -205,13 +205,13 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 int remainder = (dstW % step); \
 int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
-yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
+yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
 if(pixelsProcessed > 0) \
 ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
-  ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
+  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
 } \
 return; \
 }
-- 
2.37.1.595.g718a3a8f04-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] sws: Replace call to yuv2yuvX_mmx by yuv2yuvX_mmxext

2022-08-17 Thread Alan Kelly

---
 Remove yuv2yuvX_mmx as it is no longer used.
 libswscale/x86/swscale.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 32d441245d..89ef9f5d2b 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -205,20 +205,17 @@ static void yuv2yuvX_ ##opt(const int16_t *filter, int 
filterSize, \
 int remainder = (dstW % step); \
 int pixelsProcessed = dstW - remainder; \
 if(((uintptr_t)dest) & 15){ \
-yuv2yuvX_mmx(filter, filterSize, src, dest, dstW, dither, offset); \
+yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset); \
 return; \
 } \
 if(pixelsProcessed > 0) \
 ff_yuv2yuvX_ ##opt(filter, filterSize - 1, 0, dest - offset, 
pixelsProcessed + offset, dither, offset); \
 if(remainder > 0){ \
-  ff_yuv2yuvX_mmx(filter, filterSize - 1, pixelsProcessed, dest - offset, 
pixelsProcessed + remainder + offset, dither, offset); \
+  ff_yuv2yuvX_mmxext(filter, filterSize - 1, pixelsProcessed, dest - 
offset, pixelsProcessed + remainder + offset, dither, offset); \
 } \
 return; \
 }
 
-#if HAVE_MMX_EXTERNAL
-YUV2YUVX_FUNC_MMX(mmx, 16)
-#endif
 #if HAVE_MMXEXT_EXTERNAL
 YUV2YUVX_FUNC_MMX(mmxext, 16)
 #endif
-- 
2.37.1.595.g718a3a8f04-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2] checkasm: sw_scale: Produce more realistic test filter coefficients for yuv2yuvX

2022-08-18 Thread Alan Kelly

Thanks Martin for doing this.

On Thu, Aug 18, 2022 at 10:16 AM Martin Storsjö  wrote:

> This avoids triggering overflows in the filters, and avoids stray
> test failures in the approximate functions on x86; due to rounding
> differences, one implementation might overflow while another one
> doesn't.
>
> Signed-off-by: Martin Storsjö 
> ---
> FWIW, this modification runs successfully with over 1000 different
> seeds in checkasm.
> ---
>  tests/checkasm/sw_scale.c | 16 +++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
> index d72506ed86..ec06eafebe 100644
> --- a/tests/checkasm/sw_scale.c
> +++ b/tests/checkasm/sw_scale.c
> @@ -188,7 +188,6 @@ static void check_yuv2yuvX(int accurate)
>  uint8_t d_val = rnd();
>  memset(dither, d_val, LARGEST_INPUT_SIZE);
>  randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER *
> LARGEST_INPUT_SIZE * sizeof(int16_t));
> -randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER *
> sizeof(int16_t));
>  ctx = sws_alloc_context();
>  if (accurate)
>  ctx->flags |= SWS_ACCURATE_RND;
> @@ -202,6 +201,21 @@ static void check_yuv2yuvX(int accurate)
>  if (dstW <= osi)
>  continue;
>  for (fsi = 0; fsi < FILTER_SIZES; ++fsi) {
> +// Generate filter coefficients for the given filter size,
> +// with some properties:
> +// - The coefficients add up to the intended sum (4096,
> 1<<12)
> +// - The coefficients contain negative values
> +// - The filter intermediates don't overflow for worst
> case
> +//   inputs (all positive coefficients are coupled with
> +//   input_max and all negative coefficients with
> input_min,
> +//   or vice versa).
> +// Produce a filter with all coefficients set to
> +// -((1<<12)/(filter_size-1)) except for one (randomly
> chosen)
> +// which is set to ((1<<13)-1).
> +for (i = 0; i < filter_sizes[fsi]; ++i)
> +filter_coeff[i] = -((1 << 12) / (filter_sizes[fsi] -
> 1));
> +filter_coeff[rnd() % filter_sizes[fsi]] = (1 << 13) - 1;
> +
>  src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]);
>  vFilterData = av_malloc((filter_sizes[fsi] + 2) *
> sizeof(union VFilterData));
>  memset(vFilterData, 0, (filter_sizes[fsi] + 2) *
> sizeof(union VFilterData));
> --
> 2.25.1
>
>
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

[FFmpeg-devel] [PATCH] sws: Don't compile yuv2yuvX for mmx

2022-08-19 Thread Alan Kelly

---
 libswscale/x86/yuv2yuvX.asm | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libswscale/x86/yuv2yuvX.asm b/libswscale/x86/yuv2yuvX.asm
index b6294cb919..d5b03495fd 100644
--- a/libswscale/x86/yuv2yuvX.asm
+++ b/libswscale/x86/yuv2yuvX.asm
@@ -124,8 +124,6 @@ cglobal yuv2yuvX, 7, 7, 8, filter, filterSize, src, dest, 
dstW, dither, offset
 REP_RET
 %endmacro
 
-INIT_MMX mmx
-YUV2YUVX_FUNC
 INIT_MMX mmxext
 YUV2YUVX_FUNC
 INIT_XMM sse3
-- 
2.37.2.609.g9ff673ca1a-goog

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

1 2 >

1 - 100 of 109 matches

Mail list logo