aarch64: add some neon pix_abs functions

Martin Storsjö Fri, 15 Apr 2022 14:14:07 -0700

On Thu, 14 Apr 2022, Swinney, Jonathan wrote:

- ff_pix_abs16_neon
- ff_pix_abs16_xy2_neon


In direct micro benchmarks of these ff functions verses their C implementations,
these functions performed as follows on AWS Graviton 2:

ff_pix_abs16_neon:
c:  benchmark ran 100000 iterations in 0.955383 seconds
ff: benchmark ran 100000 iterations in 0.097669 seconds

ff_pix_abs16_xy2_neon:
c:  benchmark ran 100000 iterations in 1.916759 seconds
ff: benchmark ran 100000 iterations in 0.370729 seconds

It's generally preferred to include the numbers from checkasm --bench forthese functions. You can execute it with e.g. "checkasm --bench=pix_fmt--test=motion" to run only the relevant tests and benchmark some specificfunction.

Also for the checkasm test; generally I'd suggest looking closer at someexisting test as a good example. I think e.g. vp8dsp is a decent testcaseto use as model.

Signed-off-by: Jonathan Swinney <jswin...@amazon.com>
---
libavcodec/aarch64/Makefile              |   2 +
libavcodec/aarch64/me_cmp_init_aarch64.c |  39 +++++
libavcodec/aarch64/me_cmp_neon.S         | 209 +++++++++++++++++++++++
libavcodec/me_cmp.c                      |   2 +
libavcodec/me_cmp.h                      |   1 +
libavcodec/x86/me_cmp.asm                |   7 +
libavcodec/x86/me_cmp_init.c             |   3 +
tests/checkasm/Makefile                  |   2 +-
tests/checkasm/checkasm.c                |   1 +
tests/checkasm/checkasm.h                |   1 +
tests/checkasm/motion.c                  | 155 +++++++++++++++++
11 files changed, 421 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/me_cmp_init_aarch64.c
create mode 100644 libavcodec/aarch64/me_cmp_neon.S
create mode 100644 tests/checkasm/motion.c

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 954461f81d..18869da1b4 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -7,6 +7,7 @@ OBJS-$(CONFIG_H264PRED)                 += 
aarch64/h264pred_init.o
OBJS-$(CONFIG_H264QPEL)                 += aarch64/h264qpel_init_aarch64.o
OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
+OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o

If this is gated behind a CONFIG_ME_CMP here, we should use the sameCONFIG_ME_CMP for conditionals in checkasm too.

+++ b/libavcodec/me_cmp.c
@@ -1062,6 +1062,8 @@ av_cold void ff_me_cmp_init(MECmpContext *c, 
AVCodecContext *avctx)

    if (ARCH_ALPHA)
        ff_me_cmp_init_alpha(c, avctx);
+    if (ARCH_AARCH64)
+        ff_me_cmp_init_aarch64(c, avctx);


Please add this in alphabetical order, aarch64 comes before alpha.

    if (ARCH_ARM)
        ff_me_cmp_init_arm(c, avctx);
    if (ARCH_PPC)
diff --git a/libavcodec/me_cmp.h b/libavcodec/me_cmp.h
index e9b5161c9a..2c13bb9d3b 100644
--- a/libavcodec/me_cmp.h
+++ b/libavcodec/me_cmp.h
@@ -81,6 +81,7 @@ typedef struct MECmpContext {

void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_alpha(MECmpContext *c, AVCodecContext *avctx);
+void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_arm(MECmpContext *c, AVCodecContext *avctx);


Ditto about alphabetical order

void ff_me_cmp_init_ppc(MECmpContext *c, AVCodecContext *avctx);
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/me_cmp.asm b/libavcodec/x86/me_cmp.asm
index ad06d485ab..f73b9f9161 100644
--- a/libavcodec/x86/me_cmp.asm
+++ b/libavcodec/x86/me_cmp.asm
@@ -255,6 +255,7 @@ hadamard8x8_diff %+ SUFFIX:

    HSUM                         m0, m1, eax
    and                         rax, 0xFFFF
+    emms
    ret

I think we shouldn't be changing the existing x86 functions here. Let'soriginally assume that the existing x86 functions are correct - they'reexpected to not call emms (as the code expects that to be done at a higherlevel somewhere). Therefore, the new checkasm test needs to check the emmshandling in a way which acecpts the current x86 code. Lots of checkasmtests uses "declare_func_emms(AV_CPU_FLAG_MMX, ..." which I think impliesthis intent.

diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
index f768b1144e..f542ce0768 100644
--- a/tests/checkasm/Makefile
+++ b/tests/checkasm/Makefile
@@ -30,7 +30,7 @@ AVCODECOBJS-$(CONFIG_V210_DECODER)      += v210dec.o
AVCODECOBJS-$(CONFIG_V210_ENCODER)      += v210enc.o
AVCODECOBJS-$(CONFIG_VP9_DECODER)       += vp9dsp.o

-CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
+CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes) motion.o


I guess this should use CONFIG_ME_CMP?

# libavfilter tests
AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
index f74125e810..bbfc38636c 100644
--- a/tests/checkasm/checkasm.c
+++ b/tests/checkasm/checkasm.c
@@ -155,6 +155,7 @@ static const struct {
    #if CONFIG_VIDEODSP
        { "videodsp", checkasm_check_videodsp },
    #endif
+        { "motion", checkasm_check_motion },


Ditto about a CONFIG_ME_CMP condition?

diff --git a/tests/checkasm/motion.c b/tests/checkasm/motion.c
new file mode 100644
index 0000000000..9191a35c01
--- /dev/null
+++ b/tests/checkasm/motion.c
@@ -0,0 +1,155 @@
+/*
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <string.h>
+
+#include "libavutil/common.h"
+#include "libavutil/intreadwrite.h"
+#include "libavutil/mem_internal.h"
+
+#include "libavcodec/me_cmp.h"
+#include "libavutil/cpu.h"
+
+#include "checkasm.h"
+
+int dummy;


This looks unused?

+
+#define WIDTH 64
+#define HEIGHT 64
+
+static uint8_t img1[WIDTH * HEIGHT];
+static uint8_t img2[WIDTH * HEIGHT];

These are usually stack allocated. See e.g. vp8dsp.c - they are alsonormally allocated aligned, e.g. LOCAL_ALIGNED_16(...).

+
+
+static void fill_random(uint8_t *tab, int size)
+{
+    int i;
+    AVLFG prng;
+
+    av_lfg_init(&prng, 1);

Don't use your own PRNG here, use the rnd() macro from checkasm.h. This ishooked up to the test seed that is printed when checkasm is started, whichallows covering different test combinations each time the test is executed(and can be replayed by rerunning checkasm with --seed=1234).


Overall, check other examples, e.g. vp8dsp.c, for use of the rnd() macro.

+    for(i=0;i<size;i++) {


Please add spaces around operators.

+        tab[i] = av_lfg_get(&prng) % 256;
+    }
+}
+
+static void test_motion(const char *name,
+                 me_cmp_func test_func, me_cmp_func ref_func)
+{
+    int x, y, d1, d2, it;
+    uint8_t *ptr;
+
+declare_func(int, struct MpegEncContext *c,
+             uint8_t *blk1 /* align width (8 or 16) */,
+             uint8_t *blk2 /* align 1 */, ptrdiff_t stride,
+             int h);


Maybe declare_func_emms would avoid the need for the emms changes.

+
+    if (test_func == ref_func || test_func == NULL || ref_func == NULL) {
+        return;
+    }
+
+    /* test correctness */
+    for(it=0;it<20;it++) {
+
+        fill_random(img1, WIDTH * HEIGHT);
+        fill_random(img2, WIDTH * HEIGHT);

Here, fill_random reruns things deterministically as it always reinits thePRNG in the same way, so running 20 iterations doesn't increase testcoverage. With proper use of rnd() it could work though.

+
+        if (check_func(test_func, "%s", name)) {

Don't rerun check_func() many times when testing multiple iterations; usecheck_func() once outermost, and while testing that specific function, doas many iterations as are relevant

+            for(y=0;y<HEIGHT-17;y++) {
+                for(x=0;x<WIDTH-17;x++) {

This is indeed a very exhaustive test, that's great. But if we alreadyhave such an exhaustive test we probably don't need to do 20 iterations ofit. If the test instead only tests a couple randomly chosen dimensions,doing e.g. 20 iterations sounds like a good idea.

+                    ptr = img2 + y * WIDTH + x;
+                    d2 = call_ref(NULL, img1, ptr, WIDTH, 8);
+                    d1 = call_new(NULL, img1, ptr, WIDTH, 8);
+
+                    if (d1 != d2) {
+                        fail();
+                        printf("error: mmx=%d c=%d\n", d1, d2);
+                    }
+                    bench_new(NULL, img1, ptr, WIDTH, 8);

If doing multiple iterations of the same, I would suggest not runningbench_new each of them; normally you'd have exhaustive testing usingcall_ref/call_new and checking their outputs, and then just a couple callswith bench_new to benchmark things. (In this setup, the benchmark scoreends up an average of all input size combinations. In some cases, thebenchmark is only done on the biggest dimension, or on a couple relevantcases.)

+                }
+            }
+        }
+    }
+    emms_c();
+}
+
+#define sizeof_array(ar) (sizeof(ar)/sizeof((ar)[0]))


Use FF_ARRAY_ELEMS

+
+#define ME_CMP_1D_ARRAYS(XX)                                                   
\
+    XX(sad)                                                                    
\
+    XX(sse)                                                                    
\
+    XX(hadamard8_diff)                                                         
\
+    XX(dct_sad)                                                                
\
+    XX(quant_psnr)                                                             
\
+    XX(bit)                                                                    
\
+    XX(rd)                                                                     
\
+    XX(vsad)                                                                   
\
+    XX(vsse)                                                                   
\
+    XX(nsse)                                                                   
\
+    XX(w53)                                                                    
\
+    XX(w97)                                                                    
\
+    XX(dct_max)                                                                
\
+    XX(dct264_sad)                                                             
\
+    XX(me_pre_cmp)                                                             
\
+    XX(me_cmp)                                                                 
\
+    XX(me_sub_cmp)                                                             
\
+    XX(mb_cmp)                                                                 
\
+    XX(ildct_cmp)                                                              
\
+    XX(frame_skip_cmp)                                                         
\
+    XX(median_sad)
+
+
+static void check_motion(void)
+{
+    char buf[64];
+    AVCodecContext *ctx;
+    MECmpContext c_ctx, ff_ctx;
+
+    memset(&c_ctx, 0, sizeof(c_ctx));
+    memset(&ff_ctx, 0, sizeof(ff_ctx));
+
+    /* allocate AVCodecContext */
+    ctx = avcodec_alloc_context3(NULL);
+    ctx->flags |= AV_CODEC_FLAG_BITEXACT;
+    /* clear cpu flags to get C versions of functions */
+    ff_me_cmp_init(&ff_ctx, ctx);
+    av_force_cpu_flags(0);
+    ff_me_cmp_init(&c_ctx, ctx);

No, this isn't how you do it. A test shouldn't touch the cpu flagsmanually. (This probably is what causes the surprising difference in testcounts that Michael noticed.)

On a high level, the checkasm test framework runs your test multipletimes, with the cpu mask set to all intermediate levels. So first yourtest gets the C-only version of the function. On the next time around, itgets e.g. the MMX version of the function (if any). Then later it gets anSSE2, SSSE3, etc version of the function.

The check_func() macro does the magic - it looks up (using the string key)the previous function implementation for the same key, so that thatfunction gets used as reference. So within your test you should only inityour DSP functions once, using the cpu feature mask that the testframework sets up for you.

+
+    for (int i = 0; i < sizeof_array(c_ctx.pix_abs); i++) {
+        for (int j = 0; j < sizeof_array(c_ctx.pix_abs[0]); j++) {
+            snprintf(buf, sizeof(buf), "pix_abs_%d_%d", i, j);
+            test_motion(buf, ff_ctx.pix_abs[i][j], c_ctx.pix_abs[i][j]);
+        }
+    }
+
+#define XX(me_cmp_array)                                                       
 \
+    for (int i = 0; i < sizeof_array(c_ctx.me_cmp_array); i++) {               
 \
+        snprintf(buf, sizeof(buf), #me_cmp_array "_%d", i);                    
 \
+        test_motion(buf, ff_ctx.me_cmp_array[i], c_ctx.me_cmp_array[i]);       
 \
+    }
+    ME_CMP_1D_ARRAYS(XX)
+#undef XX
+
+}
+
+void checkasm_check_motion(void)
+{
+    check_motion();
+    report("motion");
+}
--
2.32.0

In addition to the test setup you've done, you also need to add the testto tests/fate/checkasm.mak, so that "make fate-checkasm" (and make fate)includes this new test.


// Martin

_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Re: [FFmpeg-devel] [PATCH v2 1/1] lavc/aarch64: add some neon pix_abs functions

Reply via email to