On Sun, Aug 18, 2024 at 10:43 PM Martin Storsjö <[email protected]> wrote:
> On Sun, 18 Aug 2024, Ramiro Polla wrote:
>
> > A53 A76
> > pix_norm1_c: 519.2 231.5
> > pix_norm1_neon: 195.0 ( 2.66x) 44.2 ( 5.24x)
> > pix_sum_c: 344.5 242.2
> > pix_sum_neon: 119.0 ( 2.89x) 41.7 ( 5.81x)
> > ---
>
> Hmm, those speedups on the A53 look quite small. I guess that's because
> this isn't unrolled at all, as you mention. Especially for A53, I would
> expect unrolling to have a very large effect here. But it sounds weird if
> you say perf indicates that it is slower in real world use. Yes, unrolling
> does make the code use more space and makes the I-cache less efficient,
> but in this case it would only be a difference of like 2 instructions?
These are the checkasm benchmarks I got for the unrolled version with
manual instruction ordering to give better results on the A53 (patch
attached for reference):
A53 A76
pix_norm1_c: 519.0 231.7
pix_norm1_neon: 140.0 ( 3.71x) 41.5 ( 5.58x)
pix_norm1_dotprod: 17.2 (13.47x)
pix_sum_c: 347.2 242.0
pix_sum_neon: 72.0 ( 4.82x) 21.0 (11.52x)
I had tested the real world case on the A76, but not on the A53. I
spent a couple of hours with perf trying to find the source of the
discrepancy but I couldn't find anything conclusive. I need to learn
more about how to test cache misses.
I just tested again with the following command:
$ taskset -c 2 ./ffmpeg_g -benchmark -f lavfi -i
"testsrc2=size=1920x1080" -vcodec mpeg4 -q 31 -vframes 100 -f rawvideo
-y /dev/null
The entire test was about 1% faster unrolled on A53, but about 1%
slower unrolled on A76 (I had the Raspberry Pi 5 in mind for these
optimizations, so I preferred choosing the version that was faster on
the A76). I wonder if there is any way we could check at runtime. The
problem is also that I don't even know for certain what is causing
this.
> > diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > new file mode 100644
> > index 0000000000..89e50e29b3
> > --- /dev/null
> > +++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
> > @@ -0,0 +1,67 @@
> > +/*
> > + * Copyright (c) 2024 Ramiro Polla
> > + *
> > + * This file is part of FFmpeg.
> > + *
> > + * FFmpeg is free software; you can redistribute it and/or
> > + * modify it under the terms of the GNU Lesser General Public
> > + * License as published by the Free Software Foundation; either
> > + * version 2.1 of the License, or (at your option) any later version.
> > + *
> > + * FFmpeg is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + * Lesser General Public License for more details.
> > + *
> > + * You should have received a copy of the GNU Lesser General Public
> > + * License along with FFmpeg; if not, write to the Free Software
> > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
> > 02110-1301 USA
> > + */
> > +
> > +#include "libavutil/aarch64/asm.S"
> > +
> > +function ff_pix_sum16_neon, export=1
> > +// x0 const uint8_t *pix
> > +// x1 int line_size
> > +
> > + sxtw x1, w1
> > + movi v0.16b, #0
> > + mov w2, #16
> > +
> > +1:
> > + ld1 { v1.16b }, [x0], x1
>
> Nit; we usually don't have these {} written with spaces inside of the
> braces, same below.
Oops, I should check my other neon code then...
> > + subs w2, w2, #1
> > + uadalp v0.8h, v1.16b
> > + b.ne 1b
> > +
> > + uaddlp v0.4s, v0.8h
> > + uaddlv d0, v0.4s
>
> Couldn't this be aggregated with just one instruction, "uaddlv s0, v0.8h"?
> There's no need to widen it to 64 bit as we're truncating the returned
> value to 32 bit anyway.
Yes, that works. I'll fix it in the next iteration.
commit efe57bd2270d0430265ab34c1d035c6738d77889
Author: Ramiro Polla <[email protected]>
Date: Wed Aug 7 18:54:57 2024 +0200
avcodec/aarch64/mpegvideoencdsp: add neon pix_sum and pix_norm1
A53 A76
pix_norm1_c: 519.0 231.7
pix_norm1_neon: 140.0 ( 3.71x) 41.5 ( 5.58x)
pix_sum_c: 347.2 242.0
pix_sum_neon: 72.0 ( 4.82x) 21.0 (11.52x)
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index a3256bb1cc..de0653ebbc 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -10,6 +10,7 @@ OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_init_aarch64.o
OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_init_aarch64.o
OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_init_aarch64.o
OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o
+OBJS-$(CONFIG_MPEGVIDEOENC) += aarch64/mpegvideoencdsp_init.o
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o
OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_init_aarch64.o
OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o
@@ -51,6 +52,7 @@ NEON-OBJS-$(CONFIG_IDCTDSP) += aarch64/idctdsp_neon.o \
aarch64/simple_idct_neon.o
NEON-OBJS-$(CONFIG_ME_CMP) += aarch64/me_cmp_neon.o
NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o
+NEON-OBJS-$(CONFIG_MPEGVIDEOENC) += aarch64/mpegvideoencdsp_neon.o
NEON-OBJS-$(CONFIG_PIXBLOCKDSP) += aarch64/pixblockdsp_neon.o
NEON-OBJS-$(CONFIG_VC1DSP) += aarch64/vc1dsp_neon.o
NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o
diff --git a/libavcodec/aarch64/mpegvideoencdsp_init.c b/libavcodec/aarch64/mpegvideoencdsp_init.c
new file mode 100644
index 0000000000..7eb632ed1b
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_init.c
@@ -0,0 +1,39 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/mpegvideoencdsp.h"
+#include "config.h"
+
+int ff_pix_sum16_neon(const uint8_t *pix, int line_size);
+int ff_pix_norm1_neon(const uint8_t *pix, int line_size);
+
+av_cold void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+ AVCodecContext *avctx)
+{
+ int cpu_flags = av_get_cpu_flags();
+
+ if (have_neon(cpu_flags)) {
+ c->pix_sum = ff_pix_sum16_neon;
+ c->pix_norm1 = ff_pix_norm1_neon;
+ }
+}
diff --git a/libavcodec/aarch64/mpegvideoencdsp_neon.S b/libavcodec/aarch64/mpegvideoencdsp_neon.S
new file mode 100644
index 0000000000..0a68116971
--- /dev/null
+++ b/libavcodec/aarch64/mpegvideoencdsp_neon.S
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2024 Ramiro Polla
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+function ff_pix_sum16_neon, export=1
+// x0 const uint8_t *pix
+// x1 int line_size
+
+ add x2, x0, w1, sxtw
+ lsl w1, w1, #1
+
+ ld1 { v1.16b }, [x0], x1
+ ld1 { v2.16b }, [x2], x1
+ ld1 { v3.16b }, [x0], x1
+ uaddlp v0.8h, v1.16b
+ ld1 { v1.16b }, [x2], x1
+ uadalp v0.8h, v2.16b
+ ld1 { v2.16b }, [x0], x1
+ uadalp v0.8h, v3.16b
+ ld1 { v3.16b }, [x2], x1
+ uadalp v0.8h, v1.16b
+ ld1 { v1.16b }, [x0], x1
+ uadalp v0.8h, v2.16b
+ ld1 { v2.16b }, [x2], x1
+ uadalp v0.8h, v3.16b
+ ld1 { v3.16b }, [x0], x1
+ uadalp v0.8h, v1.16b
+ ld1 { v1.16b }, [x2], x1
+ uadalp v0.8h, v2.16b
+ ld1 { v2.16b }, [x0], x1
+ uadalp v0.8h, v3.16b
+ ld1 { v3.16b }, [x2], x1
+ uadalp v0.8h, v1.16b
+ ld1 { v1.16b }, [x0], x1
+ uadalp v0.8h, v2.16b
+ ld1 { v2.16b }, [x2], x1
+ uadalp v0.8h, v3.16b
+ ld1 { v3.16b }, [x0], x1
+ uadalp v0.8h, v1.16b
+ ld1 { v1.16b }, [x2]
+ uadalp v0.8h, v2.16b
+ uadalp v0.8h, v3.16b
+ uadalp v0.8h, v1.16b
+
+ uaddlp v0.4s, v0.8h
+ uaddlv d0, v0.4s
+ fmov w0, s0
+
+ ret
+endfunc
+
+function ff_pix_norm1_neon, export=1
+// x0 const uint8_t *pix
+// x1 int line_size
+
+ add x2, x0, w1, sxtw
+ lsl w1, w1, #1
+
+ ld1 { v1.16b }, [x0], x1
+ ld1 { v2.16b }, [x2], x1
+ ld1 { v3.16b }, [x0], x1
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ ld1 { v1.16b }, [x2], x1
+ uaddlp v6.4s, v4.8h
+ uaddlp v7.4s, v5.8h
+ umull v4.8h, v2.8b, v2.8b
+ umull2 v5.8h, v2.16b, v2.16b
+ ld1 { v2.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v3.8b, v3.8b
+ umull2 v5.8h, v3.16b, v3.16b
+ ld1 { v3.16b }, [x2], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ ld1 { v1.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v2.8b, v2.8b
+ umull2 v5.8h, v2.16b, v2.16b
+ ld1 { v2.16b }, [x2], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v3.8b, v3.8b
+ umull2 v5.8h, v3.16b, v3.16b
+ ld1 { v3.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ ld1 { v1.16b }, [x2], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v2.8b, v2.8b
+ umull2 v5.8h, v2.16b, v2.16b
+ ld1 { v2.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v3.8b, v3.8b
+ umull2 v5.8h, v3.16b, v3.16b
+ ld1 { v3.16b }, [x2], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ ld1 { v1.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v2.8b, v2.8b
+ umull2 v5.8h, v2.16b, v2.16b
+ ld1 { v2.16b }, [x2], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v3.8b, v3.8b
+ umull2 v5.8h, v3.16b, v3.16b
+ ld1 { v3.16b }, [x0], x1
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ ld1 { v1.16b }, [x2]
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v2.8b, v2.8b
+ umull2 v5.8h, v2.16b, v2.16b
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v3.8b, v3.8b
+ umull2 v5.8h, v3.16b, v3.16b
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+ umull v4.8h, v1.8b, v1.8b
+ umull2 v5.8h, v1.16b, v1.16b
+ uadalp v6.4s, v4.8h
+ uadalp v7.4s, v5.8h
+
+ add v0.4s, v6.4s, v7.4s
+ uaddlv d0, v0.4s
+ fmov w0, s0
+
+ ret
+endfunc
diff --git a/libavcodec/mpegvideoencdsp.c b/libavcodec/mpegvideoencdsp.c
index 997d048663..a96f0b6436 100644
--- a/libavcodec/mpegvideoencdsp.c
+++ b/libavcodec/mpegvideoencdsp.c
@@ -245,7 +245,9 @@ av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
c->draw_edges = draw_edges_8_c;
-#if ARCH_ARM
+#if ARCH_AARCH64
+ ff_mpegvideoencdsp_init_aarch64(c, avctx);
+#elif ARCH_ARM
ff_mpegvideoencdsp_init_arm(c, avctx);
#elif ARCH_PPC
ff_mpegvideoencdsp_init_ppc(c, avctx);
diff --git a/libavcodec/mpegvideoencdsp.h b/libavcodec/mpegvideoencdsp.h
index 95084679d9..63dbd39603 100644
--- a/libavcodec/mpegvideoencdsp.h
+++ b/libavcodec/mpegvideoencdsp.h
@@ -46,6 +46,8 @@ typedef struct MpegvideoEncDSPContext {
void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
+void ff_mpegvideoencdsp_init_aarch64(MpegvideoEncDSPContext *c,
+ AVCodecContext *avctx);
void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c,
AVCodecContext *avctx);
void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c,
_______________________________________________
ffmpeg-devel mailing list
[email protected]
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
[email protected] with subject "unsubscribe".