On Tue, 21 Jun 2022, Martin Storsjö wrote:

On Mon, 13 Jun 2022, Swinney, Jonathan wrote:

- added a test for yuv2plane1 (currently disabled for x86_64)

What's the reason for having it disabled for x86 - is it another case where the current implementations there aren't bitexact? Could we avoid that by setting the bitexact flag for the new yuv2yuv1 test?

- fixed test for yuv2planeX for aarch64 which was previously not working at all

Could we make the test fuzzy and allow minor differences from the reference, when the bitexact flag isn't set, and separately test with the bitexact flag and require exact matches?

@@ -95,7 +210,7 @@ static void check_yuv2yuvX(void)
    ff_sws_init_scale(ctx);
    for(isi = 0; isi < INPUT_SIZES; ++isi){
        dstW = input_sizes[isi];
-        for(osi = 0; osi < 64; osi += 16){
+        for(osi = 0; osi < 1; osi += 16){

This looks like a stray leftover change?

I had a look at this, trying to fix things up. This now passes tests on x86_32, x86_64 and aarch64. See the attached patch, which goes on top of yours.

It's not intended as a final version of how things should be necessarily, but as a more concrete pointer about how it could be done - it needs at least reindenting after adding the outer for loop.

I also had to skip the filter sizes 1 and 3 in check_yuv2yuvX, because ff_yuv2planeX_8_sse2 couldn't handle those. I presume that means that in practice, those aren't ever used?

// Martin
From e6d33cc199a879060e8e954e6eaea8aebe96b433 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <mar...@martin.st>
Date: Wed, 22 Jun 2022 12:16:15 +0300
Subject: [PATCH] checkasm: sw_scale: Fixups

Changes:
- Omit x86 yuv2plane1 if SWS_ACCURATE_RND is set
- Remove yuv2plane1_8_ref from the testcase; this function can
  be tested with the usual call_ref pattern
- Add testing with and without SWS_ACCURATE_RND in both check_yuv2yuvX
  and check_yuv2yuv1
- Remove the x86 specific reference function in checkasm; use the
  generic reference function which matches swscale's C code, and
  compare with a tolerance of 2 unless SWS_ACCURATE_RND is set
- Reinstate testing of offsets in check_yuv2yuvX
- Make all tests arch independent
- Clarify why we can't use call_ref in check_yuv2yuvX

The code isn't properly indented (I added an outer for loop, without
reindenting the loop body, for patch clarity).
---
 libswscale/x86/swscale.c  |  11 ++--
 tests/checkasm/sw_scale.c | 113 +++++++++++++++-----------------------
 2 files changed, 50 insertions(+), 74 deletions(-)

diff --git a/libswscale/x86/swscale.c b/libswscale/x86/swscale.c
index 73869355b8..1c57b3a700 100644
--- a/libswscale/x86/swscale.c
+++ b/libswscale/x86/swscale.c
@@ -550,7 +550,8 @@ switch(c->dstBpc){ \
     if (EXTERNAL_MMX(cpu_flags)) {
         ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
         ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
+        if (!(c->flags & SWS_ACCURATE_RND))
+            ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
 
         switch (c->srcFormat) {
         case AV_PIX_FMT_YA8:
@@ -599,7 +600,8 @@ switch(c->dstBpc){ \
         ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
                             HAVE_ALIGNED_STACK || ARCH_X86_64);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
+        if (!(c->flags & SWS_ACCURATE_RND))
+            ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
 
         switch (c->srcFormat) {
         case AV_PIX_FMT_YA8:
@@ -648,14 +650,15 @@ switch(c->dstBpc){ \
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
                             if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
                             HAVE_ALIGNED_STACK || ARCH_X86_64);
-        if (c->dstBpc == 16 && !isBE(c->dstFormat))
+        if (c->dstBpc == 16 && !isBE(c->dstFormat) && !(c->flags & SWS_ACCURATE_RND))
             c->yuv2plane1 = ff_yuv2plane1_16_sse4;
     }
 
     if (EXTERNAL_AVX(cpu_flags)) {
         ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
                             HAVE_ALIGNED_STACK || ARCH_X86_64);
-        ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
+        if (!(c->flags & SWS_ACCURATE_RND))
+            ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
 
         switch (c->srcFormat) {
         case AV_PIX_FMT_YUYV422:
diff --git a/tests/checkasm/sw_scale.c b/tests/checkasm/sw_scale.c
index edce3cbe4e..0c19974034 100644
--- a/tests/checkasm/sw_scale.c
+++ b/tests/checkasm/sw_scale.c
@@ -39,45 +39,25 @@ static void yuv2planeX_8_ref(const int16_t *filter, int filterSize,
                              const int16_t **src, uint8_t *dest, int dstW,
                              const uint8_t *dither, int offset)
 {
-#if ARCH_X86_64
-    // This reference function is the same approximate algorithm employed by the
-    // SIMD functions on x86.
-    int i, d;
-    d = ((filterSize - 1) * 8 + dither[0]) >> 4;
-    for ( i = 0; i < dstW; i++) {
-        int16_t val = d;
-        int j;
-        union {
-            int val;
-            int16_t v[2];
-        } t;
-        for (j = 0; j < filterSize; j++){
-            t.val = (int)src[j][i + offset] * (int)filter[j];
-            val += t.v[1];
-        }
-        dest[i]= av_clip_uint8(val>>3);
-    }
-#else
-    // Other architectures use the default implementation as the reference.
+    // This corresponds to the yuv2planeX_8_c function
     int i;
-    for (i=0; i<dstW; i++) {
+    for (i = 0; i < dstW; i++) {
         int val = dither[(i + offset) & 7] << 12;
         int j;
-        for (j=0; j<filterSize; j++)
+        for (j = 0; j < filterSize; j++)
             val += src[j][i] * filter[j];
 
-        dest[i]= av_clip_uint8(val>>19);
+        dest[i]= av_clip_uint8(val >> 19);
     }
-#endif
 }
-static void yuv2plane1_8_ref(const int16_t *src, uint8_t *dest, int dstW,
-                             const uint8_t *dither, int offset)
+
+static int cmp_off_by_n(const uint8_t *ref, const uint8_t *test, size_t n, int accuracy)
 {
-    int i;
-    for (i=0; i<dstW; i++) {
-        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
-        dest[i]= av_clip_uint8(val);
+    for (size_t i = 0; i < n; i++) {
+        if (abs(ref[i] - test[i]) > accuracy)
+            return 1;
     }
+    return 0;
 }
 
 static void print_data(uint8_t *p, size_t len, size_t offset)
@@ -85,7 +65,7 @@ static void print_data(uint8_t *p, size_t len, size_t offset)
     size_t i = 0;
     for (; i < len; i++) {
         if (i % 8 == 0) {
-            printf("0x%04lx: ", i+offset);
+            printf("0x%04zx: ", i+offset);
         }
         printf("0x%02x ", (uint32_t) p[i]);
         if (i % 8 == 7) {
@@ -140,7 +120,10 @@ static void check_yuv2yuv1(void)
 
     randomize_buffers((uint8_t*)dither, 8);
     randomize_buffers((uint8_t*)src_pixels, LARGEST_INPUT_SIZE * sizeof(int16_t));
+  for (int accurate = 0; accurate <= 1; accurate++) {
     ctx = sws_alloc_context();
+    if (accurate)
+        ctx->flags |= SWS_ACCURATE_RND;
     if (sws_init_context(ctx, NULL, NULL) < 0)
         fail();
 
@@ -149,13 +132,13 @@ static void check_yuv2yuv1(void)
         dstW = input_sizes[isi];
         for(osi = 0; osi < OFFSET_SIZES; osi++){
             offset = offsets[osi];
-            if (check_func(ctx->yuv2plane1, "yuv2yuv1_%d_%d", offset, dstW)){
+            if (check_func(ctx->yuv2plane1, "yuv2yuv1_%d_%d%s", offset, dstW, accurate ? "_accurate" : "")){
                 memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                 memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-                yuv2plane1_8_ref(src_pixels, dst0, dstW, dither, offset);
+                call_ref(src_pixels, dst0, dstW, dither, offset);
                 call_new(src_pixels, dst1, dstW, dither, offset);
-                if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
+                if (cmp_off_by_n(dst0, dst1, dstW * sizeof(dst0[0]), accurate ? 0 : 2)) {
                     fail();
                     printf("failed: yuv2yuv1_%d_%d\n", offset, dstW);
                     fail_offset = show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
@@ -171,6 +154,7 @@ static void check_yuv2yuv1(void)
         }
     }
     sws_freeContext(ctx);
+  }
 }
 
 static void check_yuv2yuvX(void)
@@ -179,7 +163,8 @@ static void check_yuv2yuvX(void)
     int fsi, osi, isi, i, j;
     int dstW;
 #define LARGEST_FILTER 16
-    const int filter_sizes[] = {1, 2, 3, 4, 8, 16};
+    // ff_yuv2planeX_8_sse2 can't handle odd filter sizes
+    const int filter_sizes[] = {2, 4, 8, 16};
     const int FILTER_SIZES = sizeof(filter_sizes)/sizeof(filter_sizes[0]);
 #define LARGEST_INPUT_SIZE 512
     static const int input_sizes[] = {8, 24, 128, 144, 256, 512};
@@ -203,58 +188,47 @@ static void check_yuv2yuvX(void)
     memset(dither, d_val, LARGEST_INPUT_SIZE);
     randomize_buffers((uint8_t*)src_pixels, LARGEST_FILTER * LARGEST_INPUT_SIZE * sizeof(int16_t));
     randomize_buffers((uint8_t*)filter_coeff, LARGEST_FILTER * sizeof(int16_t));
+  for (int accurate = 0; accurate <= 1; accurate++) {
     ctx = sws_alloc_context();
+    if (accurate)
+        ctx->flags |= SWS_ACCURATE_RND;
     if (sws_init_context(ctx, NULL, NULL) < 0)
         fail();
 
     ff_sws_init_scale(ctx);
     for(isi = 0; isi < INPUT_SIZES; ++isi){
         dstW = input_sizes[isi];
-        for(osi = 0; osi < 1; osi += 16){
+        for(osi = 0; osi < 64; osi += 16){
+            if (dstW <= osi)
+                continue;
             for(fsi = 0; fsi < FILTER_SIZES; ++fsi){
                 src = av_malloc(sizeof(int16_t*) * filter_sizes[fsi]);
                 vFilterData = av_malloc((filter_sizes[fsi] + 2) * sizeof(union VFilterData));
                 memset(vFilterData, 0, (filter_sizes[fsi] + 2) * sizeof(union VFilterData));
                 for(i = 0; i < filter_sizes[fsi]; ++i){
                     src[i] = &src_pixels[i * LARGEST_INPUT_SIZE];
-                    vFilterData[i].src = src[i];
+                    vFilterData[i].src = src[i] - osi;
                     for(j = 0; j < 4; ++j)
                         vFilterData[i].coeff[j + 4] = filter_coeff[i];
                 }
-                if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d", filter_sizes[fsi], osi, dstW)){
+                if (check_func(ctx->yuv2planeX, "yuv2yuvX_%d_%d_%d%s", filter_sizes[fsi], osi, dstW, accurate ? "_accurate" : "")){
+                    const int16_t *filter = ctx->use_mmx_vfilter ? (const int16_t*)vFilterData : &filter_coeff[0];
                     memset(dst0, 0, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                     memset(dst1, 0, LARGEST_INPUT_SIZE * sizeof(dst1[0]));
 
-                    if (ARCH_X86_64) {
-                        // The reference function is not the scalar function selected when mmx
-                        // is deactivated as the SIMD functions do not give the same result as
-                        // the scalar ones due to rounding. The SIMD functions are activated by
-                        // the flag SWS_ACCURATE_RND
-                        yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
-                        // There's no point in calling new for the reference function
-                        if(ctx->use_mmx_vfilter) {
-                            call_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
-                            if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
-                                fail();
-                                printf("failed: yuv2yuvX_%d_%d_%d\n", filter_sizes[fsi], osi, dstW);
-                                show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
-                            }
-                            if(dstW == LARGEST_INPUT_SIZE)
-                                bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
-                        }
-                    }
+                    // We can't use call_ref here, because we don't know if use_mmx_vfilter was set for that
+                    // function or not, so we can't pass it the parameters correctly.
+                    yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
 
-                    if (ARCH_AARCH64) {
-                        yuv2planeX_8_ref(&filter_coeff[0], filter_sizes[fsi], src, dst0, dstW - osi, dither, osi);
-                        call_new(&filter_coeff[0], filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
-                        if (memcmp(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]))) {
-                            fail();
-                            printf("failed: yuv2yuvX_%d_%d_%d\n", filter_sizes[fsi], osi, dstW);
-                            show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
-                        }
-                        if(dstW == LARGEST_INPUT_SIZE)
-                            bench_new(&filter_coeff[0], filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                    call_new(filter, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+                    if (cmp_off_by_n(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]), accurate ? 0 : 2)) {
+                        fail();
+                        printf("failed: yuv2yuvX_%d_%d_%d%s\n", filter_sizes[fsi], osi, dstW, accurate ? "_accurate" : "");
+                        show_differences(dst0, dst1, LARGEST_INPUT_SIZE * sizeof(dst0[0]));
                     }
+                    if(dstW == LARGEST_INPUT_SIZE)
+                        bench_new((const int16_t*)vFilterData, filter_sizes[fsi], src, dst1, dstW - osi, dither, osi);
+
                 }
                 av_freep(&src);
                 av_freep(&vFilterData);
@@ -262,6 +236,7 @@ static void check_yuv2yuvX(void)
         }
     }
     sws_freeContext(ctx);
+  }
 #undef FILTER_SIZES
 }
 
@@ -377,10 +352,8 @@ void checkasm_check_sw_scale(void)
 {
     check_hscale();
     report("hscale");
-    if (!ARCH_X86_64) {
-        check_yuv2yuv1();
-        report("yuv2yuv1");
-    }
+    check_yuv2yuv1();
+    report("yuv2yuv1");
     check_yuv2yuvX();
     report("yuv2yuvX");
 }
-- 
2.32.0 (Apple Git-132)

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to