On Sun, 4 Jun 2023, logan....@myais.com.cn wrote:
From: Logan Lyu <logan....@myais.com.cn>
Signed-off-by: Logan Lyu <logan....@myais.com.cn>
---
libavcodec/aarch64/Makefile | 1 +
libavcodec/aarch64/hevcdsp_epel_neon.S | 378 ++++++++++++++++++++++
libavcodec/aarch64/hevcdsp_init_aarch64.c | 7 +-
3 files changed, 385 insertions(+), 1 deletion(-)
create mode 100644 libavcodec/aarch64/hevcdsp_epel_neon.S
diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 216191640c..cb428b49e0 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -69,4 +69,5 @@ NEON-OBJS-$(CONFIG_HEVC_DECODER) +=
aarch64/hevcdsp_deblock_neon.o \
aarch64/hevcdsp_idct_neon.o \
aarch64/hevcdsp_init_aarch64.o \
aarch64/hevcdsp_qpel_neon.o \
+ aarch64/hevcdsp_epel_neon.o
\
aarch64/hevcdsp_sao_neon.o
diff --git a/libavcodec/aarch64/hevcdsp_epel_neon.S
b/libavcodec/aarch64/hevcdsp_epel_neon.S
new file mode 100644
index 0000000000..fe494dd843
--- /dev/null
+++ b/libavcodec/aarch64/hevcdsp_epel_neon.S
@@ -0,0 +1,378 @@
+/* -*-arm64-*-
+ * vim: syntax=arm64asm
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+#define MAX_PB_SIZE 64
+
+const epel_filters, align=4
+ .byte 0, 0, 0, 0
+ .byte -2, 58, 10, -2
+ .byte -4, 54, 16, -2
+ .byte -6, 46, 28, -4
+ .byte -4, 36, 36, -4
+ .byte -4, 28, 46, -6
+ .byte -2, 16, 54, -4
+ .byte -2, 10, 58, -2
+endconst
+
+#if HAVE_I8MM
+.macro EPEL_UNI_W_H_HEADER
+ ldr x12, [sp]
+ sub x2, x2, #1
+ movrel x9, epel_filters
+ add x9, x9, x12, lsl #2
+ ldr w11, [x9]
+ dup v28.4s, w11
Why not just do "ld1r {v28.4s}, [x9]" here instead, avoiding the
indirection via GPRs?
Other than that, I think this mostly looks reasonable.
Btw, for any assembly patches like these, it would be appreciated if you
can provide benchmarks from checkasm, e.g. "checkasm --test=hevc_pel
--bench=put_hevc" (or maybe just "--bench") and extract the relevant lines
for the functions that you've added/modified, and mention what system
you've benchmarked it on. You get the most useful benchmarks for
micro-tuning if you can enable userspace access to the timing registers
and configure with --disable-linux-perf.
// Martin
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".