On Thu, 14 Apr 2022, Swinney, Jonathan wrote:

- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon

In direct micro benchmarks of these ff functions verses their C implementations,
these functions performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds

It's generally preferred to include the numbers from checkasm --bench for these functions. You can execute it with e.g. "checkasm --bench=pix_fmt --test=motion" to run only the relevant tests and benchmark some specific function.


Also for the checkasm test; generally I'd suggest looking closer at some existing test as a good example. I think e.g. vp8dsp is a decent testcase to use as model.

Signed-off-by: Jonathan Swinney <jswin...@amazon.com>
---
libavcodec/aarch64/Makefile              |   2 +
libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
libavcodec/me_cmp.c                      |   2 +
libavcodec/me_cmp.h                      |   1 +
libavcodec/x86/me_cmp.asm                |   7 +
libavcodec/x86/me_cmp_init.c             |   3 +
tests/checkasm/Makefile                  |   2 +-
tests/checkasm/checkasm.c                |   1 +
tests/checkasm/checkasm.h                |   1 +
tests/checkasm/motion.c                  | 155 +++++++++++++++++
11 files changed, 421 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
create mode 100644 libavcodec/aarch64/me_cmp_neon.S
create mode 100644 tests/checkasm/motion.c


diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += 
aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o

If this is gated behind a CONFIG_ME_CMP here, we should use the same CONFIG_ME_CMP for conditionals in checkasm too.

+++ b/libavcodec/me_cmp.c
@@ -1062,6 +1062,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, 
AVCodecContext *avctx)

    if (ARCH_ALPHA)
        ff_me_cmp_init_alpha(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);

Please add this in alphabetical order, aarch64 comes before alpha.

    if (ARCH_ARM)
        ff_me_cmp_init_arm(c, avctx);
    if (ARCH_PPC)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..2c13bb9d3b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -81,6 +81,7 @@ typedef struct MECmpContext {

void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);

Ditto about alphabetical order

void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ad06d485ab..f73b9f9161 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
+    emms
    ret


I think we shouldn't be changing the existing x86 functions here. Let's originally assume that the existing x86 functions are correct - they're expected to not call emms (as the code expects that to be done at a higher level somewhere). Therefore, the new checkasm test needs to check the emms handling in a way which acecpts the current x86 code. Lots of checkasm tests uses "declare_func_emms(AV_CPU_FLAG_MMX, ..." which I think implies this intent.

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..f542ce0768 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -30,7 +30,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o

-CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
+CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes) motion.o


I guess this should use CONFIG_ME_CMP?

# libavfilter tests
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f74125e810..bbfc38636c 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -155,6 +155,7 @@ static const struct {
    #if CONFIG_VIDEODSP
        { "videodsp", checkasm_check_videodsp },
    #endif
+        { "motion", checkasm_check_motion },

Ditto about a CONFIG_ME_CMP condition?

diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..9191a35c01
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,155 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+#include "libavutil/cpu.h"
+
+#include "checkasm.h"
+
+int dummy;

This looks unused?

+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];

These are usually stack allocated. See e.g. vp8dsp.c - they are also normally allocated aligned, e.g. LOCAL_ALIGNED_16(...).

+
+
+static void fill_random(uint8_t *tab, int size)
+{
+    int i;
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);

Don't use your own PRNG here, use the rnd() macro from checkasm.h. This is hooked up to the test seed that is printed when checkasm is started, which allows covering different test combinations each time the test is executed (and can be replayed by rerunning checkasm with --seed=1234).

Overall, check other examples, e.g. vp8dsp.c, for use of the rnd() macro.

+    for(i=0;i<size;i++) {

Please add spaces around operators.

+        tab[i] = av_lfg_get(&prng) % 256;
+    }
+}
+
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr;
+
+declare_func(int, struct MpegEncContext *c,
+             uint8_t *blk1 /* align width (8 or 16) */,
+             uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+             int h);

Maybe declare_func_emms would avoid the need for the emms changes.

+
+    if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
+        return;
+    }
+
+    /* test correctness */
+    for(it=0;it<20;it++) {
+
+        fill_random(img1, WIDTH * HEIGHT);
+        fill_random(img2, WIDTH * HEIGHT);

Here, fill_random reruns things deterministically as it always reinits the PRNG in the same way, so running 20 iterations doesn't increase test coverage. With proper use of rnd() it could work though.

+
+        if (check_func(test_func, "%s", name)) {

Don't rerun check_func() many times when testing multiple iterations; use check_func() once outermost, and while testing that specific function, do as many iterations as are relevant

+            for(y=0;y<HEIGHT-17;y++) {
+                for(x=0;x<WIDTH-17;x++) {

This is indeed a very exhaustive test, that's great. But if we already have such an exhaustive test we probably don't need to do 20 iterations of it. If the test instead only tests a couple randomly chosen dimensions, doing e.g. 20 iterations sounds like a good idea.

+                    ptr = img2 + y * WIDTH + x;
+                    d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+                    d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+                    if (d1 != d2) {
+                        fail();
+                        printf("error: mmx=%d c=%d\n", d1, d2);
+                    }
+                    bench_new(NULL, img1, ptr, WIDTH, 8);

If doing multiple iterations of the same, I would suggest not running bench_new each of them; normally you'd have exhaustive testing using call_ref/call_new and checking their outputs, and then just a couple calls with bench_new to benchmark things. (In this setup, the benchmark score ends up an average of all input size combinations. In some cases, the benchmark is only done on the biggest dimension, or on a couple relevant cases.)

+                }
+            }
+        }
+    }
+    emms_c();
+}
+
+#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0]))

Use FF_ARRAY_ELEMS

+
+#define ME_CMP_1D_ARRAYS(XX)                                                   
\
+    XX(sad)                                                                    
\
+    XX(sse)                                                                    
\
+    XX(hadamard8_diff)                                                         
\
+    XX(dct_sad)                                                                
\
+    XX(quant_psnr)                                                             
\
+    XX(bit)                                                                    
\
+    XX(rd)                                                                     
\
+    XX(vsad)                                                                   
\
+    XX(vsse)                                                                   
\
+    XX(nsse)                                                                   
\
+    XX(w53)                                                                    
\
+    XX(w97)                                                                    
\
+    XX(dct_max)                                                                
\
+    XX(dct264_sad)                                                             
\
+    XX(me_pre_cmp)                                                             
\
+    XX(me_cmp)                                                                 
\
+    XX(me_sub_cmp)                                                             
\
+    XX(mb_cmp)                                                                 
\
+    XX(ildct_cmp)                                                              
\
+    XX(frame_skip_cmp)                                                         
\
+    XX(median_sad)
+
+
+static void check_motion(void)
+{
+    char buf[64];
+    AVCodecContext *ctx;
+    MECmpContext c_ctx, ff_ctx;
+
+    memset(&c_ctx, 0, sizeof(c_ctx));
+    memset(&ff_ctx, 0, sizeof(ff_ctx));
+
+    /* allocate AVCodecContext */
+    ctx = avcodec_alloc_context3(NULL);
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+    /* clear cpu flags to get C versions of functions */
+    ff_me_cmp_init(&ff_ctx, ctx);
+    av_force_cpu_flags(0);
+    ff_me_cmp_init(&c_ctx, ctx);

No, this isn't how you do it. A test shouldn't touch the cpu flags manually. (This probably is what causes the surprising difference in test counts that Michael noticed.)

On a high level, the checkasm test framework runs your test multiple times, with the cpu mask set to all intermediate levels. So first your test gets the C-only version of the function. On the next time around, it gets e.g. the MMX version of the function (if any). Then later it gets an SSE2, SSSE3, etc version of the function.

The check_func() macro does the magic - it looks up (using the string key) the previous function implementation for the same key, so that that function gets used as reference. So within your test you should only init your DSP functions once, using the cpu feature mask that the test framework sets up for you.

+
+    for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
+        for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
+            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+            test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
+        }
+    }
+
+#define XX(me_cmp_array)                                                       
 \
+    for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) {               
 \
+        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                    
 \
+        test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]);       
 \
+    }
+    ME_CMP_1D_ARRAYS(XX)
+#undef XX
+
+}
+
+void checkasm_check_motion(void)
+{
+    check_motion();
+    report("motion");
+}
--
2.32.0

In addition to the test setup you've done, you also need to add the test to tests/fate/checkasm.mak, so that "make fate-checkasm" (and make fate) includes this new test.

// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to