On Sun, Aug 18, 2024 at 10:43 PM Martin Storsjö <mar...@martin.st> wrote:
> On Sun, 18 Aug 2024, Ramiro Polla wrote:
>
> >                   A53             A76
> > pix_norm1_c:     519.2           231.5
> > pix_norm1_neon:  195.0 ( 2.66x)   44.2 ( 5.24x)
> > pix_sum_c:       344.5           242.2
> > pix_sum_neon:    119.0 ( 2.89x)   41.7 ( 5.81x)
> > ---
>
> Hmm, those speedups on the A53 look quite small. I guess that's because
> this isn't unrolled at all, as you mention. Especially for A53, I would
> expect unrolling to have a very large effect here. But it sounds weird if
> you say perf indicates that it is slower in real world use. Yes, unrolling
> does make the code use more space and makes the I-cache less efficient,
> but in this case it would only be a difference of like 2 instructions?

These are the checkasm benchmarks I got for the unrolled version with
manual instruction ordering to give better results on the A53 (patch
attached for reference):
                      A53             A76
pix_norm1_c:        519.0           231.7
pix_norm1_neon:     140.0 ( 3.71x)   41.5 ( 5.58x)
pix_norm1_dotprod:                   17.2 (13.47x)
pix_sum_c:          347.2           242.0
pix_sum_neon:        72.0 ( 4.82x)   21.0 (11.52x)

I had tested the real world case on the A76, but not on the A53. I
spent a couple of hours with perf trying to find the source of the
discrepancy but I couldn't find anything conclusive. I need to learn
more about how to test cache misses.

I just tested again with the following command:
$ taskset -c 2 ./ffmpeg_g -benchmark -f lavfi -i
"testsrc2=size=1920x1080" -vcodec mpeg4 -q 31 -vframes 100 -f rawvideo
-y /dev/null

The entire test was about 1% faster unrolled on A53, but about 1%
slower unrolled on A76 (I had the Raspberry Pi 5 in mind for these
optimizations, so I preferred choosing the version that was faster on
the A76). I wonder if there is any way we could check at runtime. The
problem is also that I don't even know for certain what is causing
this.

> > diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S 
> > b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > new file mode 100644
> > index 0000000000..89e50e29b3
> > --- /dev/null
> > +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > @@ -0,0 +1,67 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
> > 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_pix_sum16_neon, export=1
> > +// x0  const uint8_t *pix
> > +// x1  int line_size
> > +
> > +        sxtw            x1, w1
> > +        movi            v0.16b, #0
> > +        mov             w2, #16
> > +
> > +1:
> > +        ld1             { v1.16b }, [x0], x1
>
> Nit; we usually don't have these {} written with spaces inside of the
> braces, same below.

Oops, I should check my other neon code then...

> > +        subs            w2, w2, #1
> > +        uadalp          v0.8h, v1.16b
> > +        b.ne            1b
> > +
> > +        uaddlp          v0.4s, v0.8h
> > +        uaddlv          d0, v0.4s
>
> Couldn't this be aggregated with just one instruction, "uaddlv s0, v0.8h"?
> There's no need to widen it to 64 bit as we're truncating the returned
> value to 32 bit anyway.

Yes, that works. I'll fix it in the next iteration.
commit efe57bd2270d0430265ab34c1d035c6738d77889
Author: Ramiro Polla <ramiro.po...@gmail.com>
Date:   Wed Aug 7 18:54:57 2024 +0200

    avcodec/aarch64/mpegvideoencdsp: add neon pix_sum and pix_norm1
    
                       A53             A76
    pix_norm1_c:     519.0           231.7
    pix_norm1_neon:  140.0 ( 3.71x)   41.5 ( 5.58x)
    pix_sum_c:       347.2           242.0
    pix_sum_neon:     72.0 ( 4.82x)   21.0 (11.52x)

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index a3256bb1cc..de0653ebbc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP)                  += aarch64/hpeldsp_init_aarch64.o
 OBJS-$(CONFIG_IDCTDSP)                  += aarch64/idctdsp_init_aarch64.o
 OBJS-$(CONFIG_ME_CMP)                   += aarch64/me_cmp_init_aarch64.o
 OBJS-$(CONFIG_MPEGAUDIODSP)             += aarch64/mpegaudiodsp_init.o
+OBJS-$(CONFIG_MPEGVIDEOENC)             += aarch64/mpegvideoencdsp_init.o
 OBJS-$(CONFIG_NEON_CLOBBER_TEST)        += aarch64/neontest.o
 OBJS-$(CONFIG_PIXBLOCKDSP)              += aarch64/pixblockdsp_init_aarch64.o
 OBJS-$(CONFIG_VIDEODSP)                 += aarch64/videodsp_init.o
@@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
                                            aarch64/simple_idct_neon.o
 NEON-OBJS-$(CONFIG_ME_CMP)              += aarch64/me_cmp_neon.o
 NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+NEON-OBJS-$(CONFIG_MPEGVIDEOENC)        += aarch64/mpegvideoencdsp_neon.o
 NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
 NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
 NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
new file mode 100644
index 0000000000..7eb632ed1b
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideoencdsp.h"
+#include "config.h"
+
+int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
+int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
+
+av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                             AVCodecContext *avctx)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+        c->pix_sum   = ff_pix_sum16_neon;
+        c->pix_norm1 = ff_pix_norm1_neon;
+    }
+}
diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
new file mode 100644
index 0000000000..0a68116971
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_sum16_neon, export=1
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        add             x2, x0, w1, sxtw
+        lsl             w1, w1, #1
+
+        ld1             { v1.16b }, [x0], x1
+        ld1             { v2.16b }, [x2], x1
+        ld1             { v3.16b }, [x0], x1
+        uaddlp          v0.8h, v1.16b
+        ld1             { v1.16b }, [x2], x1
+        uadalp          v0.8h, v2.16b
+        ld1             { v2.16b }, [x0], x1
+        uadalp          v0.8h, v3.16b
+        ld1             { v3.16b }, [x2], x1
+        uadalp          v0.8h, v1.16b
+        ld1             { v1.16b }, [x0], x1
+        uadalp          v0.8h, v2.16b
+        ld1             { v2.16b }, [x2], x1
+        uadalp          v0.8h, v3.16b
+        ld1             { v3.16b }, [x0], x1
+        uadalp          v0.8h, v1.16b
+        ld1             { v1.16b }, [x2], x1
+        uadalp          v0.8h, v2.16b
+        ld1             { v2.16b }, [x0], x1
+        uadalp          v0.8h, v3.16b
+        ld1             { v3.16b }, [x2], x1
+        uadalp          v0.8h, v1.16b
+        ld1             { v1.16b }, [x0], x1
+        uadalp          v0.8h, v2.16b
+        ld1             { v2.16b }, [x2], x1
+        uadalp          v0.8h, v3.16b
+        ld1             { v3.16b }, [x0], x1
+        uadalp          v0.8h, v1.16b
+        ld1             { v1.16b }, [x2]
+        uadalp          v0.8h, v2.16b
+        uadalp          v0.8h, v3.16b
+        uadalp          v0.8h, v1.16b
+
+        uaddlp          v0.4s, v0.8h
+        uaddlv          d0, v0.4s
+        fmov            w0, s0
+
+        ret
+endfunc
+
+function ff_pix_norm1_neon, export=1
+// x0  const uint8_t *pix
+// x1  int line_size
+
+        add             x2, x0, w1, sxtw
+        lsl             w1, w1, #1
+
+        ld1             { v1.16b }, [x0], x1
+        ld1             { v2.16b }, [x2], x1
+        ld1             { v3.16b }, [x0], x1
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        ld1             { v1.16b }, [x2], x1
+        uaddlp          v6.4s, v4.8h
+        uaddlp          v7.4s, v5.8h
+        umull           v4.8h, v2.8b,  v2.8b
+        umull2          v5.8h, v2.16b, v2.16b
+        ld1             { v2.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v3.8b,  v3.8b
+        umull2          v5.8h, v3.16b, v3.16b
+        ld1             { v3.16b }, [x2], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        ld1             { v1.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v2.8b,  v2.8b
+        umull2          v5.8h, v2.16b, v2.16b
+        ld1             { v2.16b }, [x2], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v3.8b,  v3.8b
+        umull2          v5.8h, v3.16b, v3.16b
+        ld1             { v3.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        ld1             { v1.16b }, [x2], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v2.8b,  v2.8b
+        umull2          v5.8h, v2.16b, v2.16b
+        ld1             { v2.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v3.8b,  v3.8b
+        umull2          v5.8h, v3.16b, v3.16b
+        ld1             { v3.16b }, [x2], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        ld1             { v1.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v2.8b,  v2.8b
+        umull2          v5.8h, v2.16b, v2.16b
+        ld1             { v2.16b }, [x2], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v3.8b,  v3.8b
+        umull2          v5.8h, v3.16b, v3.16b
+        ld1             { v3.16b }, [x0], x1
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        ld1             { v1.16b }, [x2]
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v2.8b,  v2.8b
+        umull2          v5.8h, v2.16b, v2.16b
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v3.8b,  v3.8b
+        umull2          v5.8h, v3.16b, v3.16b
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+        umull           v4.8h, v1.8b,  v1.8b
+        umull2          v5.8h, v1.16b, v1.16b
+        uadalp          v6.4s, v4.8h
+        uadalp          v7.4s, v5.8h
+
+        add             v0.4s, v6.4s, v7.4s
+        uaddlv          d0, v0.4s
+        fmov            w0, s0
+
+        ret
+endfunc
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 997d048663..a96f0b6436 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -245,7 +245,9 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
 
     c->draw_edges = draw_edges_8_c;
 
-#if ARCH_ARM
+#if ARCH_AARCH64
+    ff_mpegvideoencdsp_init_aarch64(c, avctx);
+#elif ARCH_ARM
     ff_mpegvideoencdsp_init_arm(c, avctx);
 #elif ARCH_PPC
     ff_mpegvideoencdsp_init_ppc(c, avctx);
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 95084679d9..63dbd39603 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -46,6 +46,8 @@ typedef struct MpegvideoEncDSPContext {
 
 void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
                              AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+                                     AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
                                  AVCodecContext *avctx);
 void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".

Reply via email to